author     Linux Build Service Account <lnxbuild@localhost>  2016-12-19 17:04:54 -0800
committer  Gerrit - the friendly Code Review server <code-review@localhost>  2016-12-19 17:04:54 -0800
commit     e26b0777dc92ab47d16b0134e57cd1c9e1083a9a
tree       6896cfb7a25310fb4f871b86d74f939d9727c3d4
parent     af833ae6a86db02de5cd1347779470485f80f5ce
parent     0dee0d1411e4ba837089a769a5bcce57a5a14df2
Merge "sched: Avoid waking idle cpu for short-burst tasks"
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/fair.c   39
-rw-r--r--  kernel/sched/hmp.c    17
-rw-r--r--  kernel/sched/rt.c     18
-rw-r--r--  kernel/sched/sched.h  10
-rw-r--r--  kernel/sysctl.c        7
5 files changed, 82 insertions, 9 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 14acefa27ec1..87538f7d495a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2613,6 +2613,7 @@ static u32 __compute_runnable_contrib(u64 n)
#define SBC_FLAG_CSTATE_LOAD 0x100
#define SBC_FLAG_BEST_SIBLING 0x200
#define SBC_FLAG_WAKER_CPU 0x400
+#define SBC_FLAG_PACK_TASK 0x800
/* Cluster selection flag */
#define SBC_FLAG_COLOC_CLUSTER 0x10000
@@ -2629,6 +2630,7 @@ struct cpu_select_env {
u8 sync:1;
u8 ignore_prev_cpu:1;
enum sched_boost_policy boost_policy;
+ u8 pack_task:1;
int prev_cpu;
DECLARE_BITMAP(candidate_list, NR_CPUS);
DECLARE_BITMAP(backup_list, NR_CPUS);
@@ -2980,8 +2982,17 @@ static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
{
int cpu_cost;
- cpu_cost = power_cost(cpu, task_load(env->p) +
+ /*
+  * We try to find the least loaded *busy* CPU irrespective
+  * of the power cost.
+  */
+ if (env->pack_task)
+         cpu_cost = cpu_min_power_cost(cpu);
+
+ else
+         cpu_cost = power_cost(cpu, task_load(env->p) +
                  cpu_cravg_sync(cpu, env->sync));
+
if (cpu_cost <= stats->min_cost)
__update_cluster_stats(cpu, stats, env, cpu_cost);
}
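
The added comment deserves one more step of reasoning: with env->pack_task set, every CPU in a cluster reports the same constant cpu_min_power_cost(), so the cpu_cost <= stats->min_cost check stops discriminating between CPUs and the load-based tie-break inside __update_cluster_stats() is what actually picks the least loaded busy CPU. A compile-and-run userspace sketch of that collapse, using an illustrative candidate struct and ignoring the real code's extra C-state and sibling tie-breaks:

/* Illustrative sketch, not kernel code: with a constant per-cluster cost the
 * "cheapest CPU" search reduces to "least loaded busy CPU". */
#include <stdint.h>
#include <stdio.h>

struct candidate {
        int      cpu;
        int      power_cost;    /* constant per cluster in pack mode */
        uint64_t load;
};

/* Cheapest candidate wins; equal cost falls back to load, which is the role
 * __update_cluster_stats() plays in the patch. */
static int pick(const struct candidate *c, int n)
{
        int best = 0;

        for (int i = 1; i < n; i++)
                if (c[i].power_cost < c[best].power_cost ||
                    (c[i].power_cost == c[best].power_cost &&
                     c[i].load < c[best].load))
                        best = i;
        return c[best].cpu;
}

int main(void)
{
        /* Pack mode: every CPU reports the cluster's minimum cost, so the
         * least loaded busy CPU (CPU 1 here) is chosen. */
        struct candidate pack_mode[] = {
                { .cpu = 0, .power_cost = 20, .load = 700 },
                { .cpu = 1, .power_cost = 20, .load = 300 },
                { .cpu = 2, .power_cost = 20, .load = 500 },
        };

        printf("pack mode picks CPU %d\n", pick(pack_mode, 3));
        return 0;
}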
@@ -3056,6 +3067,15 @@ static inline int wake_to_idle(struct task_struct *p)
(p->flags & PF_WAKE_UP_IDLE) || sysctl_sched_wake_to_idle;
}
+static inline bool env_has_special_flags(struct cpu_select_env *env)
+{
+ if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
+ env->reason)
+ return true;
+
+ return false;
+}
+
static inline bool
bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
{
@@ -3063,9 +3083,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
struct task_struct *task = env->p;
struct sched_cluster *cluster;
- if (env->boost_policy != SCHED_BOOST_NONE || env->reason ||
- !task->ravg.mark_start ||
- env->need_idle || !sched_short_sleep_task_threshold)
+ if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
return false;
prev_cpu = env->prev_cpu;
@@ -3114,8 +3132,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
static inline bool
wake_to_waker_cluster(struct cpu_select_env *env)
{
- return env->boost_policy == SCHED_BOOST_NONE &&
- !env->need_idle && !env->reason && env->sync &&
+ return env->sync &&
task_load(current) > sched_big_waker_task_load &&
task_load(env->p) < sched_small_wakee_task_load;
}
@@ -3140,7 +3157,6 @@ cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
return !cpumask_empty(&tmp_mask);
}
-
/* return cheapest cpu that can fit this task */
static int select_best_cpu(struct task_struct *p, int target, int reason,
int sync)
@@ -3150,6 +3166,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
struct related_thread_group *grp;
unsigned int sbc_flag = 0;
int cpu = raw_smp_processor_id();
+ bool special;
struct cpu_select_env env = {
.p = p,
@@ -3162,6 +3179,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
.rtg = NULL,
.sbc_best_flag = 0,
.sbc_best_cluster_flag = 0,
+ .pack_task = false,
};
env.boost_policy = task_sched_boost(p) ?
@@ -3171,6 +3189,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
bitmap_zero(env.backup_list, NR_CPUS);
init_cluster_cpu_stats(&stats);
+ special = env_has_special_flags(&env);
rcu_read_lock();
@@ -3182,7 +3201,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
clear_bit(pref_cluster->id, env.candidate_list);
else
env.rtg = grp;
- } else {
+ } else if (!special) {
cluster = cpu_rq(cpu)->cluster;
if (wake_to_waker_cluster(&env)) {
if (bias_to_waker_cpu(p, cpu)) {
@@ -3203,6 +3222,10 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
}
}
+ if (!special && is_short_burst_task(p)) {
+ env.pack_task = true;
+ sbc_flag = SBC_FLAG_PACK_TASK;
+ }
retry:
cluster = select_least_power_cluster(&env);
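
Taken together, the fair.c changes gate packing on two conditions: the placement must not be "special" (no need_idle, no boost, no load-balance reason), and the task's tracked average burst must be below the new threshold. A compile-only userspace sketch of that predicate; struct placement_env here is illustrative, not the kernel's cpu_select_env:

#include <stdbool.h>
#include <stdint.h>

struct placement_env {
        bool need_idle;         /* caller asked for an idle CPU */
        bool boosted;           /* boost_policy != SCHED_BOOST_NONE */
        int  reason;            /* non-zero for load-balance placements */
};

static unsigned int sched_short_burst_ns;       /* 0 keeps packing disabled */

static bool should_pack(const struct placement_env *env, uint64_t avg_burst_ns)
{
        /* "Special" placements keep the original wake-to-idle behaviour. */
        if (env->need_idle || env->boosted || env->reason)
                return false;

        /* Only tasks whose average run burst is below the tunable pack. */
        return avg_burst_ns < sched_short_burst_ns;
}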
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index b2f3013bfe31..95e618ee1124 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -961,6 +961,13 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
+/*
+ * Scheduler tries to avoid waking up idle CPUs for tasks running
+ * in short bursts. If the task average burst is less than
+ * sysctl_sched_short_burst nanoseconds, it is eligible for packing.
+ */
+unsigned int __read_mostly sysctl_sched_short_burst;
+
static void
_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
{
@@ -1553,7 +1560,13 @@ void init_new_task_load(struct task_struct *p, bool idle_task)
memset(&p->ravg, 0, sizeof(struct ravg));
p->cpu_cycles = 0;
p->ravg.curr_burst = 0;
- p->ravg.avg_burst = 0;
+ /*
+ * Initialize the avg_burst to twice the threshold, so that
+ * a task would not be classified as short burst right away
+ * after fork. It takes at least 6 sleep-wakeup cycles for
+ * the avg_burst to go below the threshold.
+ */
+ p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
@@ -2987,6 +3000,8 @@ void reset_task_stats(struct task_struct *p)
p->ravg.curr_window_cpu = curr_window_ptr;
p->ravg.prev_window_cpu = prev_window_ptr;
+ p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+
/* Retain EXITING_TASK marker */
p->ravg.sum_history[0] = sum;
}
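
The "at least 6 sleep-wakeup cycles" remark follows from how avg_burst decays. Assuming it is averaged with the scheduler's usual update_avg()-style 1/8 EWMA, i.e. avg += (sample - avg) / 8 (the decay rule is an assumption here; only the 2x initial value and the six-cycle figure come from the patch comment), a task that forks at twice the threshold and then runs only zero-length bursts needs exactly six updates to drop below it: 2 * (7/8)^5 is about 1.03 while 2 * (7/8)^6 is about 0.90. A small userspace demo:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint64_t threshold = 50000000ULL; /* example threshold: 50 ms */
        int64_t avg = 2 * (int64_t)threshold;   /* init_new_task_load() value */
        int cycles = 0;

        while ((uint64_t)avg >= threshold) {
                avg += (0 - avg) / 8;           /* assumed 1/8 EWMA, sample = 0 */
                cycles++;
        }
        printf("avg_burst drops below the threshold after %d cycles\n", cycles);
        return 0;                               /* prints 6 */
}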
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f8bc34c31c42..3fe00d6fa335 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1723,6 +1723,7 @@ static int find_lowest_rq_hmp(struct task_struct *task)
int i;
int restrict_cluster;
int boost_on_big;
+ int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX;
boost_on_big = sched_boost() == FULL_THROTTLE_BOOST &&
sched_boost_policy() == SCHED_BOOST_ON_BIG;
@@ -1739,6 +1740,8 @@ static int find_lowest_rq_hmp(struct task_struct *task)
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
return best_cpu; /* No targets found */
+ pack_task = is_short_burst_task(task);
+
/*
* At this point we have built a mask of cpus representing the
* lowest priority tasks in the system. Now we want to elect
@@ -1764,6 +1767,20 @@ static int find_lowest_rq_hmp(struct task_struct *task)
if (!restrict_cluster)
cpu_load = scale_load_to_cpu(cpu_load, i);
+ if (pack_task) {
+         wakeup_latency = cpu_rq(i)->wakeup_latency;
+
+         if (wakeup_latency > least_wakeup_latency)
+                 continue;
+
+         if (wakeup_latency < least_wakeup_latency) {
+                 least_wakeup_latency = wakeup_latency;
+                 min_load = cpu_load;
+                 best_cpu = i;
+                 continue;
+         }
+ }
+
if (cpu_load < min_load ||
(cpu_load == min_load &&
(i == prev_cpu || (best_cpu != prev_cpu &&
@@ -1772,6 +1789,7 @@ static int find_lowest_rq_hmp(struct task_struct *task)
best_cpu = i;
}
}
+
if (restrict_cluster && best_cpu != -1)
break;
}
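
For RT tasks the packing preference is a lexicographic ordering over the lowest-priority candidates: a CPU in a shallower idle state (lower wakeup_latency) always wins, and load only matters once latency ties, at which point the pre-existing load and prev_cpu comparison still applies. A compile-only sketch of that ordering; the struct is illustrative, not the kernel's rq, and the prev_cpu affinity tie-breaks are left out:

#include <stdbool.h>
#include <stdint.h>

struct candidate {
        int      wakeup_latency;        /* mirrors rq->wakeup_latency in the patch */
        uint64_t load;
};

/* Pack-mode ordering: shallower idle state first, load only breaks ties. */
static bool pack_prefers(const struct candidate *cand,
                         const struct candidate *best)
{
        if (cand->wakeup_latency != best->wakeup_latency)
                return cand->wakeup_latency < best->wakeup_latency;

        return cand->load < best->load;
}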
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bcf3f019a300..ae7442007e8b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1216,6 +1216,11 @@ static inline int cpu_max_power_cost(int cpu)
return cpu_rq(cpu)->cluster->max_power_cost;
}
+static inline int cpu_min_power_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->min_power_cost;
+}
+
static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period)
{
return div64_u64(cycles, period);
@@ -1413,6 +1418,11 @@ static inline u64 cpu_cravg_sync(int cpu, int sync)
return load;
}
+static inline bool is_short_burst_task(struct task_struct *p)
+{
+ return p->ravg.avg_burst < sysctl_sched_short_burst;
+}
+
extern void check_for_migration(struct rq *rq, struct task_struct *p);
extern void pre_big_task_count_change(const struct cpumask *cpus);
extern void post_big_task_count_change(const struct cpumask *cpus);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c272e31f37ea..ba69f4c96d7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -507,6 +507,13 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &three,
},
+ {
+ .procname = "sched_short_burst_ns",
+ .data = &sysctl_sched_short_burst,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_SCHED_DEBUG
{
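
The new tunable surfaces as /proc/sys/kernel/sched_short_burst_ns and, having no initializer, defaults to 0, which keeps packing disabled because no u64 avg_burst can be below zero. A userspace sketch of turning it on; the 25 ms value is purely an example, not a recommended setting:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/sched_short_burst_ns", "w");

        if (!f) {
                perror("sched_short_burst_ns");
                return 1;
        }
        fprintf(f, "%u\n", 25000000U);  /* 25 ms burst threshold */
        return fclose(f) ? 1 : 0;
}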