sched: Use only partial wait time as task demand

The scheduler currently either considers a task's entire wait time as task demand or completely ignores wait time, based on the tunable sched_account_wait_time. Both approaches have their limitations, however. The former artificially boosts a task's demand when it may not actually be justified. With the latter, the scheduler runs the risk of never being able to recognize true load (consider two CPU hogs on a single little CPU). To achieve a compromise between these two extremes, change the load tracking algorithm to only consider part of a task's wait time as its demand. The portion of wait time accounted as demand is determined by each task's percent load, e.g. for a task that waits for 10 ms and has 60% task load, only 6 ms of the wait will contribute to task demand. This approach is more fair as the scheduler now tries to determine how much of its wait time a task would actually have spent using the CPU if it had been executing. It ensures that tasks with high demand continue to see most of the benefits of accounting wait time as busy time, while lower demand tasks don't experience a disproportionately high boost to demand that would trigger unjustified big CPU usage. Note that this new approach is only applicable to wait time being considered as task demand and not wait time considered as CPU busy time. To achieve the above effect, ensure that anytime a task is waiting, its runtime in every relevant window segment is appropriately adjusted using its pct load. Change-Id: I6a698d6cb1adeca49113c3499029b422daf7871f Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
author: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> 2015-02-27 16:12:01 -0800
committer: David Keitel <dkeitel@codeaurora.org> 2016-03-23 20:01:55 -0700
commit: 38f3da47d7ac04be3727f65f07558da26b2d4068 (patch)
tree: 2c272b850e9a8f223ed3a1992dcdcc0988331d68
parent: 1cac3260d4e8fd180a7c30408c5f4ffb7b7ec4d1 (diff)
4 files changed, 38 insertions, 12 deletions
diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index d9e6972fecc3..97072a8bee02 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -1173,8 +1173,17 @@ Appears at: /proc/sys/kernel/sched_account_wait_time
 
 Default value: 1
 
-This controls whether a task's wait time is accounted as its demand for cpu
+This controls whether a task's wait time is accounted as its demand for cpu
 and thus the values found in its sum, sum_history[] and demand attributes.
+The load tracking algorithm only considers part of a task's wait time as its
+demand. The portion of wait time accounted as demand is determined by each
+task's percent load, e.g. for a task that waits for 10 ms and has 60% task
+load, only 6 ms of the wait will contribute to task demand. This approach is
+fair as the scheduler tries to determine how much of its wait time a task
+would actually have spent using the CPU had it been executing. It ensures
+that tasks with high demand continue to see most of the benefits of
+accounting wait time as busy time, while lower demand tasks don't experience
+a disproportionately high boost to demand.
 
 *** 7.16 sched_freq_account_wait_time
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da806ebac086..c38c2a05eb3d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1635,19 +1635,23 @@ static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 
 static int account_busy_for_task_demand(struct task_struct *p, int event)
 {
-	/* No need to bother updating task demand for exiting tasks
-	 * or the idle task. */
 	if (exiting_task(p) || is_idle_task(p))
 		return 0;
 
-	/* When a task is waking up it is completing a segment of non-busy
+	/*
+	 * When a task is waking up it is completing a segment of non-busy
 	 * time. Likewise, if wait time is not treated as busy time, then
 	 * when a task begins to run or is migrated, it is not running and
-	 * is completing a segment of non-busy time. */
+	 * is completing a segment of non-busy time.
+	 */
 	if (event == TASK_WAKE || (!sched_account_wait_time &&
-			 (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+		(event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
 		return 0;
 
+	/*
+	 * We are left with TASK_UPDATE, IRQ_UPDATE, PUT_PREV_TASK and
+	 * wait time being accounted as busy time.
+	 */
 	return 1;
 }
 
@@ -1719,6 +1723,15 @@ static void add_to_task_demand(struct rq *rq, struct task_struct *p,
 		p->ravg.sum = sched_ravg_window;
 }
 
+static u64 wait_adjust(struct task_struct *p, u64 delta, int event)
+{
+	/* We already know that wait time counts as busy time. */
+	if (event == PICK_NEXT_TASK || event == TASK_MIGRATE)
+		return div64_u64(delta * task_load(p), max_task_load());
+
+	return delta;
+}
+
 /*
  * Account cpu demand of task and/or update task's cpu demand history
  *
@@ -1793,7 +1806,8 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 	if (!new_window) {
 		/* The simple case - busy time contained within the existing
 		 * window. */
-		add_to_task_demand(rq, p, wallclock - mark_start);
+		add_to_task_demand(rq, p, wait_adjust(p,
+				wallclock - mark_start, event));
 		return;
 	}
 
@@ -1804,13 +1818,14 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 	window_start -= (u64)nr_full_windows * (u64)window_size;
 
 	/* Process (window_start - mark_start) first */
-	add_to_task_demand(rq, p, window_start - mark_start);
+	add_to_task_demand(rq, p,
+		wait_adjust(p, window_start - mark_start, event));
 
 	/* Push new sample(s) into task's demand history */
 	update_history(rq, p, p->ravg.sum, 1, event);
 	if (nr_full_windows)
-		update_history(rq, p, scale_exec_time(window_size, rq),
-			       nr_full_windows, event);
+		update_history(rq, p, scale_exec_time(wait_adjust(p,
+		window_size, event), rq), nr_full_windows, event);
 
 	/* Roll window_start back to current to process any remainder
 	 * in current window. */
@@ -1818,7 +1833,8 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 
 	/* Process (wallclock - window_start) next */
 	mark_start = window_start;
-	add_to_task_demand(rq, p, wallclock - mark_start);
+	add_to_task_demand(rq, p,
+		wait_adjust(p, wallclock - mark_start, event));
 }
 
 /* Reflect task activity on its demand and cpu's busy time statistics */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7d456ba3960..6da8a188ee30 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2684,7 +2684,7 @@ unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
 unsigned int __read_mostly sysctl_sched_min_runtime = 0; /* 0 ms */
 u64 __read_mostly sched_min_runtime = 0; /* 0 ms */
 
-static inline unsigned int task_load(struct task_struct *p)
+unsigned int task_load(struct task_struct *p)
 {
 	if (sched_use_pelt)
 		return p->se.avg.runnable_avg_sum_scaled;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1d675545817e..e485b120ff00 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1003,6 +1003,7 @@ extern u64 scale_load_to_cpu(u64 load, int cpu);
 extern unsigned int sched_heavy_task;
 extern void reset_cpu_hmp_stats(int cpu, int reset_cra);
 extern void fixup_nr_big_small_task(int cpu, int reset_stats);
+extern unsigned int task_load(struct task_struct *p);
 extern unsigned int max_task_load(void);
 extern void sched_account_irqtime(int cpu, struct task_struct *curr,
 				 u64 delta, u64 wallclock);
author	Syed Rameez Mustafa <rameezmustafa@codeaurora.org>	2015-02-27 16:12:01 -0800
committer	David Keitel <dkeitel@codeaurora.org>	2016-03-23 20:01:55 -0700
commit	38f3da47d7ac04be3727f65f07558da26b2d4068 (patch)
tree	2c272b850e9a8f223ed3a1992dcdcc0988331d68
parent	1cac3260d4e8fd180a7c30408c5f4ffb7b7ec4d1 (diff)