author		Srivatsa Vaddagiri <vatsa@codeaurora.org>	2014-03-29 11:40:16 -0700
committer	David Keitel <dkeitel@codeaurora.org>		2016-03-23 19:58:53 -0700
commit		3967da2dd1f97b8d558207d8d720d699aca3d566 (patch)
tree		8ca85ed41c471a7d8e5a19b7e440499bddf7bc8b /kernel
parent		e3fe80da05042802fc16f828919dfe3621a34d96 (diff)
sched: Window-based load stat improvements
Some tasks can have a sporadic load pattern such that they suddenly start
running for longer intervals after running for shorter durations. To
recognize such a sharp increase in a task's demand, the maximum of the
average of its 5 window load samples and its most recent sample is chosen
as the task demand.

Make the window size (sched_ravg_window) configurable at boot time. To
prevent users from setting inappropriate values for the window size, min
and max limits are defined. As the 'ravg' struct tracks load for both
real-time and non-real-time tasks, it is moved out of the sched_entity
struct. To avoid changing the function signatures of move_tasks() and
move_one_task(), per-cpu variables are defined to track the total load
moved. In case multiple tasks are selected to migrate in one load-balance
operation, loads > 100 could be sent through the migration notifiers.
Prevent this scenario by capping mnd.load at 100 in such cases. Define
wrapper functions to compute cpu demand for tasks and to change
rq->cumulative_runnable_avg.

Change-Id: I9abfbf3b5fe23ae615a6acd3db9580cfdeb515b4
Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Signed-off-by: Rohit Gupta <rohgup@codeaurora.org>
[rameezmustafa@codeaurora.org: Port to msm-3.18 and squash
 "dcf7256 sched: window-stats: Fix overflow bug" into this patch.]
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
[joonwoop@codeaurora.org: fixed conflict in __migrate_task().]
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
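The heart of the patch is the new demand heuristic in update_history()
(demand becomes the max of the 5-sample average and the most recent window
sample) and the 0-100 clamp applied to mnd.load before it reaches the
migration notifiers. The standalone C sketch below illustrates that
behaviour under the new 10 ms default window; demand_from_history(),
pct_load() and the test harness are hypothetical helpers written for
illustration only, not kernel code.

/*
 * Userspace sketch of the demand heuristic introduced by this patch:
 * demand = max(average of the RAVG_HIST_SIZE window samples, most recent
 * sample), then scaled to a 0-100 percentage of the window size and
 * capped at 100, as done for mnd.load when several tasks are moved in
 * one balance pass.
 */
#include <stdint.h>
#include <stdio.h>

#define RAVG_HIST_SIZE   5
#define RAVG_WINDOW_NS   10000000u   /* 10 ms, the new default window */

/* Average the history, then let a single large recent sample win. */
static uint32_t demand_from_history(const uint32_t hist[RAVG_HIST_SIZE],
                                    uint32_t recent)
{
        uint64_t sum = 0;
        int i;

        for (i = 0; i < RAVG_HIST_SIZE; i++)
                sum += hist[i];

        uint32_t avg = (uint32_t)(sum / RAVG_HIST_SIZE);

        return avg > recent ? avg : recent;
}

/* Demand as a percentage of the window, clamped to 100. */
static unsigned int pct_load(uint32_t demand)
{
        uint64_t pct = (uint64_t)demand * 100 / RAVG_WINDOW_NS;

        return pct > 100 ? 100 : (unsigned int)pct;
}

int main(void)
{
        /* Task mostly idle for 4 windows, then a burst in the 5th. */
        uint32_t hist[RAVG_HIST_SIZE] = { 1000000, 1000000, 1000000,
                                          1000000, 9000000 };
        uint32_t recent = 9000000;
        uint32_t demand = demand_from_history(hist, recent);

        /* The burst dominates: demand = 9000000 ns, i.e. 90% of the window,
         * whereas a pure 5-sample average would report only ~26%. */
        printf("demand = %u ns (%u%% of window)\n",
               (unsigned)demand, pct_load(demand));
        return 0;
}

Since the sysctl is removed in favour of early_param("sched_ravg_window",
...), the window can now only be chosen at boot, e.g. by appending
sched_ravg_window=20000000 (an illustrative value) to the kernel command
line; update_task_ravg() simply returns when the configured value is below
min_sched_ravg_window (10 ms).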
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/core.c	| 117
-rw-r--r--	kernel/sched/fair.c	|  74
-rw-r--r--	kernel/sched/sched.h	|  18
-rw-r--r--	kernel/sysctl.c		|   7
4 files changed, 132 insertions, 84 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b1d48c53bf7e..d7f43c26e0c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -860,7 +860,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
trace_sched_enq_deq_task(p, 1);
- rq->cumulative_runnable_avg += p->se.ravg.demand;
+ inc_cumulative_runnable_avg(rq, p);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -870,8 +870,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
trace_sched_enq_deq_task(p, 0);
- rq->cumulative_runnable_avg -= p->se.ravg.demand;
- BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+ dec_cumulative_runnable_avg(rq, p);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1150,11 +1149,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
mnd.src_cpu = src_cpu;
mnd.dest_cpu = dest_cpu;
- if (sysctl_sched_ravg_window)
- mnd.load = div64_u64((u64)p->se.ravg.demand * 100,
- (u64)(sysctl_sched_ravg_window));
- else
- mnd.load = 0;
+ mnd.load = pct_task_load(p);
atomic_notifier_call_chain(&migration_notifier_head,
0, (void *)&mnd);
}
@@ -1757,6 +1752,15 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
wq_worker_waking_up(p, cpu_of(rq));
}
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = 10000000;
+
+/* Min window size (in ns) = 10ms */
+__read_mostly unsigned int min_sched_ravg_window = 10000000;
+
+/* Max window size (in ns) = 1s */
+__read_mostly unsigned int max_sched_ravg_window = 1000000000;
+
/*
* Called when new window is starting for a task, to record cpu usage over
* recently concluded window(s). Normally 'samples' should be 1. It can be > 1
@@ -1766,9 +1770,9 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
static inline void
update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
{
- u32 *hist = &p->se.ravg.sum_history[0];
+ u32 *hist = &p->ravg.sum_history[0];
int ridx, widx;
- u32 max = 0;
+ u32 sum = 0, avg;
/* Ignore windows where task had no activity */
if (!runtime)
@@ -1779,86 +1783,97 @@ update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
ridx = widx - samples;
for (; ridx >= 0; --widx, --ridx) {
hist[widx] = hist[ridx];
- if (hist[widx] > max)
- max = hist[widx];
+ sum += hist[widx];
}
for (widx = 0; widx < samples && widx < RAVG_HIST_SIZE; widx++) {
hist[widx] = runtime;
- if (hist[widx] > max)
- max = hist[widx];
+ sum += hist[widx];
}
- p->se.ravg.sum = 0;
+ p->ravg.sum = 0;
if (p->on_rq) {
- rq->cumulative_runnable_avg -= p->se.ravg.demand;
+ rq->cumulative_runnable_avg -= p->ravg.demand;
BUG_ON((s64)rq->cumulative_runnable_avg < 0);
}
- /*
- * Maximum demand seen over previous RAVG_HIST_SIZE windows drives
- * frequency demand for a task. Record maximum in 'demand' attribute.
- */
- p->se.ravg.demand = max;
+
+ avg = sum / RAVG_HIST_SIZE;
+
+ p->ravg.demand = max(avg, runtime);
+
if (p->on_rq)
- rq->cumulative_runnable_avg += p->se.ravg.demand;
+ rq->cumulative_runnable_avg += p->ravg.demand;
}
-/* Window size (in ns) */
-__read_mostly unsigned int sysctl_sched_ravg_window = 50000000;
+static int __init set_sched_ravg_window(char *str)
+{
+ get_option(&str, &sched_ravg_window);
+
+ return 0;
+}
+
+early_param("sched_ravg_window", set_sched_ravg_window);
void update_task_ravg(struct task_struct *p, struct rq *rq, int update_sum)
{
- u32 window_size = sysctl_sched_ravg_window;
+ u32 window_size = sched_ravg_window;
int new_window;
u64 wallclock = sched_clock();
+ if (sched_ravg_window < min_sched_ravg_window)
+ return;
+
do {
s64 delta = 0;
int n;
u64 now = wallclock;
new_window = 0;
- delta = now - p->se.ravg.window_start;
+ delta = now - p->ravg.window_start;
BUG_ON(delta < 0);
if (delta > window_size) {
- p->se.ravg.window_start += window_size;
- now = p->se.ravg.window_start;
+ p->ravg.window_start += window_size;
+ now = p->ravg.window_start;
new_window = 1;
}
if (update_sum) {
- delta = now - p->se.ravg.mark_start;
+ unsigned int cur_freq = rq->cur_freq;
+
+ delta = now - p->ravg.mark_start;
BUG_ON(delta < 0);
- if (likely(rq->cur_freq &&
- rq->cur_freq <= max_possible_freq))
- delta = div64_u64(delta * rq->cur_freq,
+ if (unlikely(cur_freq > max_possible_freq))
+ cur_freq = max_possible_freq;
+
+ delta = div64_u64(delta * cur_freq,
max_possible_freq);
- p->se.ravg.sum += delta;
- WARN_ON(p->se.ravg.sum > window_size);
+ p->ravg.sum += delta;
+ WARN_ON(p->ravg.sum > window_size);
}
if (!new_window)
break;
- update_history(rq, p, p->se.ravg.sum, 1);
+ update_history(rq, p, p->ravg.sum, 1);
- delta = wallclock - p->se.ravg.window_start;
+ delta = wallclock - p->ravg.window_start;
BUG_ON(delta < 0);
n = div64_u64(delta, window_size);
if (n) {
if (!update_sum)
- p->se.ravg.window_start = wallclock;
+ p->ravg.window_start = wallclock;
else
- p->se.ravg.window_start += n * window_size;
- BUG_ON(p->se.ravg.window_start > wallclock);
+ p->ravg.window_start += (u64)n *
+ (u64)window_size;
+ BUG_ON(p->ravg.window_start > wallclock);
if (update_sum)
update_history(rq, p, window_size, n);
}
- p->se.ravg.mark_start = p->se.ravg.window_start;
+ p->ravg.mark_start = p->ravg.window_start;
} while (new_window);
- p->se.ravg.mark_start = wallclock;
+ p->ravg.mark_start = wallclock;
}
/*
@@ -2162,11 +2177,8 @@ out:
mnd.src_cpu = src_cpu;
mnd.dest_cpu = cpu;
- if (sysctl_sched_ravg_window)
- mnd.load = div64_u64((u64)p->se.ravg.demand * 100,
- (u64)(sysctl_sched_ravg_window));
- else
- mnd.load = 0;
+ mnd.load = pct_task_load(p);
+
/*
* Call the migration notifier with mnd for foreground task
* migrations as well as for wakeups if their load is above
@@ -2281,8 +2293,6 @@ void __dl_clear_params(struct task_struct *p)
*/
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
- int i;
-
p->on_rq = 0;
p->se.on_rq = 0;
@@ -2291,12 +2301,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
- p->se.ravg.sum = 0;
- p->se.ravg.demand = 0;
- p->se.ravg.window_start = 0;
- p->se.ravg.mark_start = 0;
- for (i = 0; i < RAVG_HIST_SIZE; ++i)
- p->se.ravg.sum_history[i] = 0;
+ init_new_task_load(p);
INIT_LIST_HEAD(&p->se.group_node);
@@ -2564,7 +2569,6 @@ void wake_up_new_task(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- u64 wallclock = sched_clock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
/* Initialize new task's runnable average */
@@ -2580,8 +2584,6 @@ void wake_up_new_task(struct task_struct *p)
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- p->se.ravg.window_start = wallclock;
- p->se.ravg.mark_start = wallclock;
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
@@ -7577,6 +7579,7 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
if (val != CPUFREQ_POSTCHANGE)
return 0;
+ BUG_ON(!new_freq);
cpu_rq(cpu)->cur_freq = new_freq;
return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cbf3d3697322..f62f9b7fd049 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2449,6 +2449,39 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline unsigned int task_load(struct task_struct *p)
+{
+ return p->ravg.demand;
+}
+
+static inline unsigned int max_task_load(void)
+{
+ return sched_ravg_window;
+}
+
+/* Return task demand in percentage scale */
+unsigned int pct_task_load(struct task_struct *p)
+{
+ unsigned int load;
+
+ load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
+
+ return load;
+}
+
+void init_new_task_load(struct task_struct *p)
+{
+ int i;
+ u64 wallclock = sched_clock();
+
+ p->ravg.sum = 0;
+ p->ravg.demand = 0;
+ p->ravg.window_start = wallclock;
+ p->ravg.mark_start = wallclock;
+ for (i = 0; i < RAVG_HIST_SIZE; ++i)
+ p->ravg.sum_history[i] = 0;
+}
+
#ifdef CONFIG_SMP
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
@@ -5558,6 +5591,7 @@ struct lb_env {
};
static DEFINE_PER_CPU(bool, dbs_boost_needed);
+static DEFINE_PER_CPU(int, dbs_boost_load_moved);
/*
* Is this task likely cache-hot:
@@ -5746,7 +5780,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
*
* Returns a task if successful and NULL otherwise.
*/
-static struct task_struct *detach_one_task(struct lb_env *env, int *total_run_moved)
+static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
@@ -5765,10 +5799,7 @@ static struct task_struct *detach_one_task(struct lb_env *env, int *total_run_mo
* inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
-
- if (sysctl_sched_ravg_window)
- *total_run_moved += div64_u64((u64)p->se.ravg.demand *
- 100, (u64)(sysctl_sched_ravg_window));
+ per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
return p;
}
@@ -5783,7 +5814,7 @@ static const unsigned int sched_nr_migrate_break = 32;
*
* Returns number of detached tasks if successful and 0 otherwise.
*/
-static int detach_tasks(struct lb_env *env, int *total_run_moved)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
@@ -5833,9 +5864,7 @@ static int detach_tasks(struct lb_env *env, int *total_run_moved)
detached++;
env->imbalance -= load;
- if (sysctl_sched_ravg_window)
- *total_run_moved += div64_u64((u64)p->se.ravg.demand *
- 100, (u64)(sysctl_sched_ravg_window));
+ per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
#ifdef CONFIG_PREEMPT
/*
@@ -6975,7 +7004,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *continue_balancing)
{
int ld_moved = 0, cur_ld_moved, active_balance = 0;
- int total_run_moved = 0;
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group = NULL;
struct rq *busiest = NULL;
@@ -7004,6 +7032,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
cpumask_copy(cpus, cpu_active_mask);
+ per_cpu(dbs_boost_load_moved, this_cpu) = 0;
schedstat_inc(sd, lb_count[idle]);
redo:
@@ -7049,7 +7078,7 @@ more_balance:
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = detach_tasks(&env, &total_run_moved);
+ cur_ld_moved = detach_tasks(&env);
/*
* We've detached some tasks from busiest_rq. Every
@@ -7187,13 +7216,16 @@ more_balance:
if (per_cpu(dbs_boost_needed, this_cpu)) {
struct migration_notify_data mnd;
- per_cpu(dbs_boost_needed, this_cpu) = false;
-
mnd.src_cpu = cpu_of(busiest);
mnd.dest_cpu = this_cpu;
- mnd.load = total_run_moved;
+ mnd.load = per_cpu(dbs_boost_load_moved, this_cpu);
+ if (mnd.load > 100)
+ mnd.load = 100;
atomic_notifier_call_chain(&migration_notifier_head,
0, (void *)&mnd);
+ per_cpu(dbs_boost_needed, this_cpu) = false;
+ per_cpu(dbs_boost_load_moved, this_cpu) = 0;
+
}
}
if (likely(!active_balance)) {
@@ -7391,13 +7423,14 @@ static int active_load_balance_cpu_stop(void *data)
struct rq *busiest_rq = data;
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
- int total_run_moved = 0;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
struct task_struct *p = NULL;
raw_spin_lock_irq(&busiest_rq->lock);
+ per_cpu(dbs_boost_load_moved, target_cpu) = 0;
+
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
@@ -7434,7 +7467,7 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
- p = detach_one_task(&env, &total_run_moved);
+ p = detach_one_task(&env);
if (p)
schedstat_inc(sd, alb_pushed);
else
@@ -7453,13 +7486,16 @@ out_unlock:
if (per_cpu(dbs_boost_needed, target_cpu)) {
struct migration_notify_data mnd;
- per_cpu(dbs_boost_needed, target_cpu) = false;
-
mnd.src_cpu = cpu_of(busiest_rq);
mnd.dest_cpu = target_cpu;
- mnd.load = total_run_moved;
+ mnd.load = per_cpu(dbs_boost_load_moved, target_cpu);
+ if (mnd.load > 100)
+ mnd.load = 100;
atomic_notifier_call_chain(&migration_notifier_head,
0, (void *)&mnd);
+
+ per_cpu(dbs_boost_needed, target_cpu) = false;
+ per_cpu(dbs_boost_load_moved, target_cpu) = 0;
}
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b85da1bb4b49..fc3f624ca8c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -35,7 +35,6 @@ extern void update_cpu_load_active(struct rq *this_rq);
static inline void update_cpu_load_active(struct rq *this_rq) { }
#endif
-extern unsigned int sysctl_sched_ravg_window;
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -912,6 +911,23 @@ static inline void sched_ttwu_pending(void) { }
#include "stats.h"
#include "auto_group.h"
+extern unsigned int sched_ravg_window;
+extern unsigned int pct_task_load(struct task_struct *p);
+extern void init_new_task_load(struct task_struct *p);
+
+static inline void
+inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+ rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+static inline void
+dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+ rq->cumulative_runnable_avg -= p->ravg.demand;
+ BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+}
+
#ifdef CONFIG_CGROUP_SCHED
/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1314618f07f8..292979f360d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -293,13 +293,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "sched_ravg_window",
- .data = &sysctl_sched_ravg_window,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "sched_wakeup_load_threshold",
.data = &sysctl_sched_wakeup_load_threshold,
.maxlen = sizeof(unsigned int),