summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSrivatsa Vaddagiri <vatsa@codeaurora.org>2014-09-01 13:26:53 +0530
committerDavid Keitel <dkeitel@codeaurora.org>2016-03-23 20:00:50 -0700
commit3a67b4ce87bf96312d4c5728047d830a66258854 (patch)
treebd392503fe5c0f2da5231f1865cc86fd8b7b0672
parent977dc392f77522559697cbedfb2d48ad7be96aec (diff)
sched: window-stats: Enhance cpu busy time accounting
rq->curr/prev_runnable_sum counters represent cpu demand from various tasks that have run on a cpu. Any task that runs on a cpu will have a representation in rq->curr_runnable_sum. Their partial_demand value will be included in rq->curr_runnable_sum. Since partial_demand is derived from historical load samples for a task, rq->curr_runnable_sum could represent "inflated/un-realistic" cpu usage. As an example, lets say that task with partial_demand of 10ms runs for only 1ms on a cpu. What is included in rq->curr_runnable_sum is 10ms (and not the actual execution time of 1ms). This leads to cpu busy time being reported on the upside causing frequency to stay higher than necessary. This patch fixes cpu busy accounting scheme to strictly represent actual usage. It also provides for conditional fixup of busy time upon migration and upon heavy-task wakeup. CRs-Fixed: 691443 Change-Id: Ic4092627668053934049af4dfef65d9b6b901e6b Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org> Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> [joonwoop@codeaurora.org: fixed conflict in init_task_load(), se.avg.decay_count has deprecated.] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
-rw-r--r--drivers/cpufreq/Kconfig2
-rw-r--r--include/linux/sched.h20
-rw-r--r--include/linux/sched/sysctl.h4
-rw-r--r--include/trace/events/sched.h113
-rw-r--r--kernel/sched/core.c1075
-rw-r--r--kernel/sched/debug.c4
-rw-r--r--kernel/sched/fair.c39
-rw-r--r--kernel/sched/sched.h27
-rw-r--r--kernel/sysctl.c52
9 files changed, 821 insertions, 515 deletions
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index fd62f091f995..e15ddcba1f73 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -27,7 +27,7 @@ config CPU_FREQ_BOOST_SW
config SCHED_FREQ_INPUT
bool "Scheduler inputs to cpufreq governor"
- depends on SMP && FAIR_GROUP_SCHED
+ depends on SCHED_HMP
help
This option enables support for scheduler based CPU utilization
calculations which may then be used by any cpufreq governor. The
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 09974bc791ea..31a9756b06dc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -314,6 +314,8 @@ extern char ___assert_task_state[1 - 2*!!(
/* Task command name length */
#define TASK_COMM_LEN 16
+extern const char *sched_window_reset_reasons[];
+
enum task_event {
PUT_PREV_TASK = 0,
PICK_NEXT_TASK = 1,
@@ -1212,7 +1214,7 @@ struct sched_avg {
u64 last_update_time, load_sum;
u32 util_sum, period_contrib;
unsigned long load_avg, util_avg;
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
u32 runnable_avg_sum_scaled;
#endif
};
@@ -1273,12 +1275,18 @@ struct ravg {
* sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
* demand for tasks.
*
- * 'flags' can have either or both of PREV_WINDOW_CONTRIB and
- * CURR_WINDOW_CONTRIB set.
+ * 'curr_window' represents task's contribution to cpu busy time
+ * statistics (rq->curr_runnable_sum) in current window
+ *
+ * 'prev_window' represents task's contribution to cpu busy time
+ * statistics (rq->prev_runnable_sum) in previous window
*/
u64 mark_start;
- u32 sum, demand, partial_demand, flags;
+ u32 sum, demand;
u32 sum_history[RAVG_HIST_SIZE_MAX];
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ u32 curr_window, prev_window;
+#endif
};
struct sched_entity {
@@ -1438,7 +1446,7 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
struct ravg ravg;
#endif
#ifdef CONFIG_CGROUP_SCHED
@@ -2545,7 +2553,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
#endif
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_dead(struct task_struct *p);
-#if defined(CONFIG_SCHED_HMP) || defined(CONFIG_SCHED_FREQ_INPUT)
+#ifdef CONFIG_SCHED_HMP
extern void sched_exit(struct task_struct *p);
#else
static inline void sched_exit(struct task_struct *p) { }
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 411812931a26..a87dec5a4e10 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -44,8 +44,10 @@ extern unsigned int sysctl_sched_wakeup_load_threshold;
extern unsigned int sysctl_sched_window_stats_policy;
extern unsigned int sysctl_sched_account_wait_time;
extern unsigned int sysctl_sched_ravg_hist_size;
-extern unsigned int sysctl_sched_freq_legacy_mode;
extern unsigned int sysctl_sched_gov_response_time;
+extern unsigned int sysctl_sched_freq_account_wait_time;
+extern unsigned int sysctl_sched_migration_fixup;
+extern unsigned int sysctl_sched_heavy_task_pct;
#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
extern unsigned int sysctl_sched_init_task_load_pct;
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 02ef9d73fd76..8829075d0746 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -72,7 +72,7 @@ TRACE_EVENT(sched_enq_deq_task,
__field(unsigned long, cpu_load )
__field(unsigned int, rt_nr_running )
__field(unsigned int, cpus_allowed )
-#ifdef CONFIG_SCHED_FREQ_INPUT
+#ifdef CONFIG_SCHED_HMP
__field(unsigned int, sum_scaled )
__field(unsigned int, period )
__field(unsigned int, demand )
@@ -89,7 +89,7 @@ TRACE_EVENT(sched_enq_deq_task,
__entry->cpu_load = task_rq(p)->cpu_load[0];
__entry->rt_nr_running = task_rq(p)->rt.rt_nr_running;
__entry->cpus_allowed = cpus_allowed;
-#ifdef CONFIG_SCHED_FREQ_INPUT
+#ifdef CONFIG_SCHED_HMP
__entry->sum_scaled = p->se.avg.runnable_avg_sum_scaled;
__entry->period = p->se.avg.runnable_avg_period;
__entry->demand = p->ravg.demand;
@@ -97,7 +97,7 @@ TRACE_EVENT(sched_enq_deq_task,
),
TP_printk("cpu=%d %s comm=%s pid=%d prio=%d nr_running=%u cpu_load=%lu rt_nr_running=%u affine=%x"
-#ifdef CONFIG_SCHED_FREQ_INPUT
+#ifdef CONFIG_SCHED_HMP
" sum_scaled=%u period=%u demand=%u"
#endif
, __entry->cpu,
@@ -105,7 +105,7 @@ TRACE_EVENT(sched_enq_deq_task,
__entry->comm, __entry->pid,
__entry->prio, __entry->nr_running,
__entry->cpu_load, __entry->rt_nr_running, __entry->cpus_allowed
-#ifdef CONFIG_SCHED_FREQ_INPUT
+#ifdef CONFIG_SCHED_HMP
, __entry->sum_scaled, __entry->period, __entry->demand
#endif
)
@@ -213,10 +213,6 @@ TRACE_EVENT(sched_set_boost,
TP_printk("ref_count=%d", __entry->ref_count)
);
-#endif /* CONFIG_SCHED_HMP */
-
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
-
TRACE_EVENT(sched_update_task_ravg,
TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
@@ -229,8 +225,6 @@ TRACE_EVENT(sched_update_task_ravg,
__field( pid_t, pid )
__field( pid_t, cur_pid )
__field(unsigned int, cur_freq )
- __field(unsigned int, cs )
- __field(unsigned int, ps )
__field( u64, wallclock )
__field( u64, mark_start )
__field( u64, delta_m )
@@ -239,9 +233,14 @@ TRACE_EVENT(sched_update_task_ravg,
__field( u64, irqtime )
__field(enum task_event, evt )
__field(unsigned int, demand )
- __field(unsigned int, partial_demand )
__field(unsigned int, sum )
__field( int, cpu )
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ __field( u64, cs )
+ __field( u64, ps )
+ __field( u32, curr_window )
+ __field( u32, prev_window )
+#endif
),
TP_fast_assign(
@@ -252,43 +251,51 @@ TRACE_EVENT(sched_update_task_ravg,
__entry->cpu = rq->cpu;
__entry->cur_pid = rq->curr->pid;
__entry->cur_freq = rq->cur_freq;
- __entry->cs = rq->curr_runnable_sum;
- __entry->ps = rq->prev_runnable_sum;
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->mark_start = p->ravg.mark_start;
__entry->delta_m = (wallclock - p->ravg.mark_start);
__entry->demand = p->ravg.demand;
- __entry->partial_demand = p->ravg.partial_demand;
__entry->sum = p->ravg.sum;
__entry->irqtime = irqtime;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ __entry->cs = rq->curr_runnable_sum;
+ __entry->ps = rq->prev_runnable_sum;
+ __entry->curr_window = p->ravg.curr_window;
+ __entry->prev_window = p->ravg.prev_window;
+#endif
),
- TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cs %u ps %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u partial_demand %u sum %u irqtime %llu",
- __entry->wallclock, __entry->win_start, __entry->delta,
+ TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ " cs %llu ps %llu cur_window %u prev_window %u"
+#endif
+ , __entry->wallclock, __entry->win_start, __entry->delta,
task_event_names[__entry->evt], __entry->cpu,
- __entry->cur_freq, __entry->cs, __entry->ps, __entry->cur_pid,
+ __entry->cur_freq, __entry->cur_pid,
__entry->pid, __entry->comm, __entry->mark_start,
- __entry->delta_m, __entry->demand, __entry->partial_demand,
- __entry->sum, __entry->irqtime)
+ __entry->delta_m, __entry->demand,
+ __entry->sum, __entry->irqtime
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ , __entry->cs, __entry->ps, __entry->curr_window,
+ __entry->prev_window
+#endif
+ )
);
TRACE_EVENT(sched_update_history,
TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
- int update_sum, int new_window, enum task_event evt),
+ enum task_event evt),
- TP_ARGS(rq, p, runtime, samples, update_sum, new_window, evt),
+ TP_ARGS(rq, p, runtime, samples, evt),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field(unsigned int, runtime )
__field( int, samples )
- __field( int, update_sum )
- __field( int, new_window )
__field(enum task_event, evt )
- __field(unsigned int, partial_demand )
__field(unsigned int, demand )
__array( u32, hist, RAVG_HIST_SIZE_MAX)
__field(unsigned int, nr_big_tasks )
@@ -301,30 +308,60 @@ TRACE_EVENT(sched_update_history,
__entry->pid = p->pid;
__entry->runtime = runtime;
__entry->samples = samples;
- __entry->update_sum = update_sum;
- __entry->new_window = new_window;
__entry->evt = evt;
- __entry->partial_demand = p->ravg.partial_demand;
__entry->demand = p->ravg.demand;
memcpy(__entry->hist, p->ravg.sum_history,
RAVG_HIST_SIZE_MAX * sizeof(u32));
-#ifdef CONFIG_SCHED_HMP
__entry->nr_big_tasks = rq->nr_big_tasks;
__entry->nr_small_tasks = rq->nr_small_tasks;
-#endif
__entry->cpu = rq->cpu;
),
- TP_printk("%d (%s): runtime %u samples %d us %d nw %d event %s partial_demand %u demand %u (hist: %u %u %u %u %u) cpu %d nr_big %u nr_small %u",
+ TP_printk("%d (%s): runtime %u samples %d event %s demand %u (hist: %u %u %u %u %u) cpu %d nr_big %u nr_small %u",
__entry->pid, __entry->comm,
- __entry->runtime, __entry->samples, __entry->update_sum,
- __entry->new_window, task_event_names[__entry->evt],
- __entry->partial_demand, __entry->demand, __entry->hist[0],
+ __entry->runtime, __entry->samples,
+ task_event_names[__entry->evt],
+ __entry->demand, __entry->hist[0],
__entry->hist[1], __entry->hist[2], __entry->hist[3],
__entry->hist[4], __entry->cpu, __entry->nr_big_tasks,
__entry->nr_small_tasks)
);
+TRACE_EVENT(sched_reset_all_window_stats,
+
+ TP_PROTO(u64 window_start, u64 window_size, u64 time_taken,
+ int reason, unsigned int old_val, unsigned int new_val),
+
+ TP_ARGS(window_start, window_size, time_taken,
+ reason, old_val, new_val),
+
+ TP_STRUCT__entry(
+ __field( u64, window_start )
+ __field( u64, window_size )
+ __field( u64, time_taken )
+ __field( int, reason )
+ __field(unsigned int, old_val )
+ __field(unsigned int, new_val )
+ ),
+
+ TP_fast_assign(
+ __entry->window_start = window_start;
+ __entry->window_size = window_size;
+ __entry->time_taken = time_taken;
+ __entry->reason = reason;
+ __entry->old_val = old_val;
+ __entry->new_val = new_val;
+ ),
+
+ TP_printk("time_taken %llu window_start %llu window_size %llu reason %s old_val %u new_val %u",
+ __entry->time_taken, __entry->window_start,
+ __entry->window_size,
+ sched_window_reset_reasons[__entry->reason],
+ __entry->old_val, __entry->new_val)
+);
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
TRACE_EVENT(sched_migration_update_sum,
TP_PROTO(struct rq *rq, struct task_struct *p),
@@ -333,9 +370,9 @@ TRACE_EVENT(sched_migration_update_sum,
TP_STRUCT__entry(
__field(int, cpu )
- __field(int, cs )
- __field(int, ps )
__field(int, pid )
+ __field( u64, cs )
+ __field( u64, ps )
),
TP_fast_assign(
@@ -345,12 +382,10 @@ TRACE_EVENT(sched_migration_update_sum,
__entry->pid = p->pid;
),
- TP_printk("cpu %d: cs %u ps %u pid %d", __entry->cpu,
+ TP_printk("cpu %d: cs %llu ps %llu pid %d", __entry->cpu,
__entry->cs, __entry->ps, __entry->pid)
);
-#ifdef CONFIG_SCHED_FREQ_INPUT
-
TRACE_EVENT(sched_get_busy,
TP_PROTO(int cpu, u64 load),
@@ -395,7 +430,7 @@ TRACE_EVENT(sched_freq_alert,
#endif /* CONFIG_SCHED_FREQ_INPUT */
-#endif /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_HMP */
/*
* Tracepoint for waking up a task:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index def99b4ba90a..95c7b64bef93 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1160,13 +1160,13 @@ static inline void clear_hmp_request(int cpu) { }
#endif /* CONFIG_SCHED_HMP */
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#if defined(CONFIG_SCHED_HMP)
/*
- * sched_window_stats_policy, sched_account_wait_time, sched_ravg_hist_size
- * and sched_freq_legacy_mode have a 'sysctl' copy associated with them. This
- * is required for atomic update of those variables when being modifed via
- * sysctl interface.
+ * sched_window_stats_policy, sched_account_wait_time, sched_ravg_hist_size,
+ * sched_migration_fixup, sched_freq_account_wait_time have a 'sysctl' copy
+ * associated with them. This is required for atomic update of those variables
+ * when being modified via sysctl interface.
*
* IMPORTANT: Initialize both copies to same value!!
*/
@@ -1174,7 +1174,6 @@ static inline void clear_hmp_request(int cpu) { }
static __read_mostly unsigned int sched_ravg_hist_size = 5;
__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
-
static __read_mostly unsigned int sched_window_stats_policy =
WINDOW_STATS_MAX_RECENT_AVG;
__read_mostly unsigned int sysctl_sched_window_stats_policy =
@@ -1183,39 +1182,33 @@ __read_mostly unsigned int sysctl_sched_window_stats_policy =
static __read_mostly unsigned int sched_account_wait_time = 1;
__read_mostly unsigned int sysctl_sched_account_wait_time = 1;
-static __read_mostly unsigned int sched_freq_legacy_mode;
-__read_mostly unsigned int sysctl_sched_freq_legacy_mode;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+static __read_mostly unsigned int sched_migration_fixup = 1;
+__read_mostly unsigned int sysctl_sched_migration_fixup = 1;
-/* Window size (in ns) */
-__read_mostly unsigned int sched_ravg_window = 10000000;
+static __read_mostly unsigned int sched_freq_account_wait_time;
+__read_mostly unsigned int sysctl_sched_freq_account_wait_time;
-/* Min window size (in ns) = 10ms */
-#define MIN_SCHED_RAVG_WINDOW 10000000
+/*
+ * Force-issue notification to governor if we waited long enough since sending
+ * last notification and did not see any freq change.
+ */
+__read_mostly unsigned int sysctl_sched_gov_response_time = 10000000;
-/* Max window size (in ns) = 1s */
-#define MAX_SCHED_RAVG_WINDOW 1000000000
+__read_mostly int sysctl_sched_freq_inc_notify_slack_pct = -INT_MAX;
+__read_mostly int sysctl_sched_freq_dec_notify_slack_pct = INT_MAX;
+
+static __read_mostly unsigned int sched_io_is_busy;
+
+#endif /* CONFIG_SCHED_FREQ_INPUT */
/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
unsigned int __read_mostly sched_use_pelt;
-/* Temporarily disable window-stats activity on all cpus */
-unsigned int __read_mostly sched_disable_window_stats;
-
unsigned int max_possible_efficiency = 1024;
unsigned int min_possible_efficiency = 1024;
-__read_mostly int sysctl_sched_freq_inc_notify_slack_pct = -INT_MAX;
-__read_mostly int sysctl_sched_freq_dec_notify_slack_pct = INT_MAX;
-
-static __read_mostly unsigned int sched_io_is_busy;
-
-/*
- * Force-issue notification to governor if we waited long enough since sending
- * last notification and did not see any freq change.
- */
-__read_mostly unsigned int sysctl_sched_gov_response_time = 10000000;
-
/*
* Maximum possible frequency across all cpus. Task demand and cpu
* capacity (cpu_power) metrics are scaled in reference to it.
@@ -1234,16 +1227,84 @@ unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_load_scale_factor = 1024; /* max(rq->load_scale_factor) */
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = 10000000;
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+/* Temporarily disable window-stats activity on all cpus */
+unsigned int __read_mostly sched_disable_window_stats;
+
static unsigned int sync_cpu;
static u64 sched_init_jiffy;
static u64 sched_clock_at_init_jiffy;
-#define CURR_WINDOW_CONTRIB 1
-#define PREV_WINDOW_CONTRIB 2
-#define DONT_ACCOUNT 4
+#define EXITING_TASK_MARKER 0xdeaddead
+
+static inline int exiting_task(struct task_struct *p)
+{
+ return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
+}
+
+static int __init set_sched_ravg_window(char *str)
+{
+ get_option(&str, &sched_ravg_window);
+
+ sched_use_pelt = (sched_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ sched_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+ return 0;
+}
+
+early_param("sched_ravg_window", set_sched_ravg_window);
+
+static inline void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ BUG_ON(delta < 0);
+ if (delta < sched_ravg_window)
+ return;
+
+ nr_windows = div64_u64(delta, sched_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
+}
+
+static inline u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ unsigned int cur_freq = rq->cur_freq;
+ int sf;
+
+ if (unlikely(cur_freq > max_possible_freq ||
+ (cur_freq == rq->max_freq &&
+ rq->max_freq < rq->max_possible_freq)))
+ cur_freq = rq->max_possible_freq;
+
+ delta = div64_u64(delta * cur_freq, max_possible_freq);
+ sf = (rq->efficiency * 1024) / max_possible_efficiency;
+ delta *= sf;
+ delta >>= 10;
+
+ return delta;
+}
#ifdef CONFIG_SCHED_FREQ_INPUT
+static inline int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!sched_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
+}
+
/* Does freq_required sufficiently exceed or fall behind cur_freq? */
static inline int
nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
@@ -1355,64 +1416,318 @@ void check_for_freq_change(struct rq *rq)
(void *)(long)max_demand_cpu);
}
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+ u64 irqtime, int event)
+{
+ if (is_idle_task(p)) {
+ /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+ if (event == PICK_NEXT_TASK)
+ return 0;
+
+ /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+ return irqtime || cpu_is_waiting_on_io(rq);
+ }
+
+ if (event == TASK_WAKE)
+ return 0;
+
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+ event == TASK_UPDATE)
+ return 1;
+
+ /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ return sched_freq_account_wait_time;
+}
+
+static inline int
+heavy_task_wakeup(struct task_struct *p, struct rq *rq, int event)
+{
+ u32 task_demand = p->ravg.demand;
+
+ if (!sched_heavy_task || event != TASK_WAKE ||
+ task_demand < sched_heavy_task || exiting_task(p))
+ return 0;
+
+ if (p->ravg.mark_start > rq->window_start)
+ return 0;
+
+ /* has a full window elapsed since task slept? */
+ return (rq->window_start - p->ravg.mark_start > sched_ravg_window);
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, nr_full_windows = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = sched_ravg_window;
+ u64 delta;
+
+ new_window = mark_start < window_start;
+ if (new_window)
+ nr_full_windows = div64_u64((window_start - mark_start),
+ window_size);
+
+ /* Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks. */
+ if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+ u32 curr_window = 0;
+
+ if (!nr_full_windows)
+ curr_window = p->ravg.curr_window;
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+ /* account_busy_for_cpu_time() = 0, so no update to the
+ * task's current window needs to be made. This could be
+ * for example
+ *
+ * - a wakeup event on a task within the current
+ * window (!new_window below, no action required),
+ * - switching to a new task from idle (PICK_NEXT_TASK)
+ * in a new window where irqtime is 0 and we aren't
+ * waiting on IO */
+
+ if (!new_window)
+ return;
+
+ /* A new window has started. The RQ demand must be rolled
+ * over if p is the current task. */
+ if (p_is_curr_task) {
+ u64 prev_sum = 0;
+
+ /* p is either idle task or an exiting task */
+ if (!nr_full_windows)
+ prev_sum = rq->curr_runnable_sum;
+
+ rq->prev_runnable_sum = prev_sum;
+ rq->curr_runnable_sum = 0;
+
+ } else if (heavy_task_wakeup(p, rq, event)) {
+ /* A new window has started. If p is a waking
+ * heavy task its prev_window contribution is faked
+ * to be its window-based demand. Note that this can
+ * introduce phantom load into the system depending
+ * on the window policy and task behavior. This feature
+ * can be controlled via the sched_heavy_task
+ * tunable. */
+ p->ravg.prev_window = p->ravg.demand;
+ rq->prev_runnable_sum += p->ravg.demand;
+ }
+
+ return;
+ }
+
+ if (!new_window) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window. */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ rq->curr_runnable_sum += delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window += delta;
+
+ return;
+ }
+
+ if (!p_is_curr_task) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+ rq->prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum += delta;
+ if (!exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window += delta;
+ delta += rq->curr_runnable_sum;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+ /* Rollover is done here by overwriting the values in
+ * prev_runnable_sum and curr_runnable_sum. */
+ rq->prev_runnable_sum = delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum = delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (irqtime) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime. */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /* Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted. */
+ rq->prev_runnable_sum = rq->curr_runnable_sum;
+ if (mark_start > window_start) {
+ rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /* The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first. */
+ delta = window_start - mark_start;
+ if (delta > window_size) {
+ delta = window_size;
+ /* If there's 1 or more full windows of IRQ busy time
+ * then the entire prev_runnable_sum will be a window
+ * of IRQ time - there should be no contribution from
+ * anything else. */
+ rq->prev_runnable_sum = 0;
+ }
+ delta = scale_exec_time(delta, rq);
+ rq->prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+ BUG();
+}
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+}
+
#endif /* CONFIG_SCHED_FREQ_INPUT */
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+ /* No need to bother updating task demand for exiting tasks
+ * or the idle task. */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /* When a task is waking up it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time. */
+ if (event == TASK_WAKE || (!sched_account_wait_time &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ return 1;
+}
+
/*
* Called when new window is starting for a task, to record cpu usage over
* recently concluded window(s). Normally 'samples' should be 1. It can be > 1
* when, say, a real-time task runs without preemption for several windows at a
* stretch.
*/
-static inline void
-update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
- int update_sum, int new_window, int event)
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
{
u32 *hist = &p->ravg.sum_history[0];
int ridx, widx;
u32 max = 0, avg, demand;
u64 sum = 0;
- if (new_window) {
- p->ravg.flags &= ~(CURR_WINDOW_CONTRIB | PREV_WINDOW_CONTRIB);
- if (runtime)
- p->ravg.flags |= PREV_WINDOW_CONTRIB;
- }
-
/* Ignore windows where task had no activity */
- if (!runtime && !update_sum)
- return;
-
- if (!new_window) {
- for (ridx = 0; ridx < sched_ravg_hist_size - 1; ++ridx) {
- sum += hist[ridx];
- if (hist[ridx] > max)
- max = hist[ridx];
- }
- sum += runtime;
- if (runtime > max)
- max = runtime;
- goto compute_demand;
- }
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
/* Push new 'runtime' value onto stack */
- widx = RAVG_HIST_SIZE_MAX - 1;
+ widx = sched_ravg_hist_size - 1;
ridx = widx - samples;
for (; ridx >= 0; --widx, --ridx) {
hist[widx] = hist[ridx];
- if (widx < sched_ravg_hist_size) {
- sum += hist[widx];
- if (hist[widx] > max)
- max = hist[widx];
- }
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
}
- for (widx = 0; widx < samples && widx < RAVG_HIST_SIZE_MAX; widx++) {
+ for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
hist[widx] = runtime;
- if (widx < sched_ravg_hist_size) {
- sum += hist[widx];
- if (hist[widx] > max)
- max = hist[widx];
- }
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
}
p->ravg.sum = 0;
@@ -1423,7 +1738,6 @@ update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
dec_nr_big_small_task(rq, p);
}
-compute_demand:
avg = div64_u64(sum, sched_ravg_hist_size);
if (sched_window_stats_policy == WINDOW_STATS_RECENT)
@@ -1435,256 +1749,145 @@ compute_demand:
else
demand = max(avg, runtime);
- if (new_window)
- p->ravg.demand = demand;
-
- if (!sched_freq_legacy_mode && update_sum &&
- (p->ravg.flags & CURR_WINDOW_CONTRIB)) {
- rq->curr_runnable_sum -= p->ravg.partial_demand;
- BUG_ON((s64)rq->curr_runnable_sum < 0);
- }
-
- p->ravg.partial_demand = demand;
-
- if (!sched_freq_legacy_mode && update_sum && !new_window) {
- rq->curr_runnable_sum += p->ravg.partial_demand;
- p->ravg.flags |= CURR_WINDOW_CONTRIB;
- }
-
- if (!new_window)
- return;
+ p->ravg.demand = demand;
if (p->on_rq) {
rq->cumulative_runnable_avg += p->ravg.demand;
if (p->sched_class == &fair_sched_class)
inc_nr_big_small_task(rq, p);
}
- trace_sched_update_history(rq, p, runtime, samples, update_sum,
- new_window, event);
-}
-static int __init set_sched_ravg_window(char *str)
-{
- get_option(&str, &sched_ravg_window);
-
- sched_use_pelt = (sched_ravg_window < MIN_SCHED_RAVG_WINDOW ||
- sched_ravg_window > MAX_SCHED_RAVG_WINDOW);
-
- return 0;
+done:
+ trace_sched_update_history(rq, p, runtime, samples, event);
}
-early_param("sched_ravg_window", set_sched_ravg_window);
-
-static inline int cpu_is_waiting_on_io(struct rq *rq)
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+ u64 delta)
{
- if (!sched_io_is_busy)
- return 0;
-
- return atomic_read(&rq->nr_iowait);
+ delta = scale_exec_time(delta, rq);
+ p->ravg.sum += delta;
+ if (unlikely(p->ravg.sum > sched_ravg_window))
+ p->ravg.sum = sched_ravg_window;
}
-static inline void
-move_window_start(struct rq *rq, u64 wallclock, int update_sum,
- struct task_struct *p)
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_windows' samples of window_size are also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
{
- s64 delta;
- int nr_windows;
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = sched_ravg_window;
- delta = wallclock - rq->window_start;
- BUG_ON(delta < 0);
- if (delta < sched_ravg_window)
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(p, event)) {
+ if (new_window)
+ /* If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those. */
+ update_history(rq, p, p->ravg.sum, 1, event);
return;
+ }
- nr_windows = div64_u64(delta, sched_ravg_window);
- rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
-
- if (is_idle_task(rq->curr) && !cpu_is_waiting_on_io(rq)) {
- if (nr_windows == 1)
- rq->prev_runnable_sum = rq->curr_runnable_sum;
- else
- rq->prev_runnable_sum = 0;
-
- rq->curr_runnable_sum = 0;
+ if (!new_window) {
+ /* The simple case - busy time contained within the existing
+ * window. */
+ add_to_task_demand(rq, p, wallclock - mark_start);
+ return;
}
-}
-static inline u64 scale_exec_time(u64 delta, struct rq *rq)
-{
- unsigned int cur_freq = rq->cur_freq;
- int sf;
+ /* Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start. */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
- if (unlikely(cur_freq > max_possible_freq ||
- (cur_freq == rq->max_freq &&
- rq->max_freq < rq->max_possible_freq)))
- cur_freq = rq->max_possible_freq;
+ /* Process (window_start - mark_start) first */
+ add_to_task_demand(rq, p, window_start - mark_start);
- delta = div64_u64(delta * cur_freq, max_possible_freq);
- sf = (rq->efficiency * 1024) / max_possible_efficiency;
- delta *= sf;
- delta >>= 10;
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows)
+ update_history(rq, p, scale_exec_time(window_size, rq),
+ nr_full_windows, event);
- return delta;
+ /* Roll window_start back to current to process any remainder
+ * in current window. */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ add_to_task_demand(rq, p, wallclock - mark_start);
}
+/* Reflect task activity on its demand and cpu's busy time statistics */
static void update_task_ravg(struct task_struct *p, struct rq *rq,
int event, u64 wallclock, u64 irqtime)
{
- u32 window_size = sched_ravg_window;
- int update_sum, new_window;
- u64 mark_start = p->ravg.mark_start;
- u64 window_start;
- s64 delta = 0;
-
if (sched_use_pelt || !rq->window_start || sched_disable_window_stats)
return;
lockdep_assert_held(&rq->lock);
- if (!p->ravg.mark_start)
- goto done;
-
- update_sum = (event == PUT_PREV_TASK || event == TASK_UPDATE ||
- (sched_account_wait_time &&
- (event == PICK_NEXT_TASK || event == TASK_MIGRATE)));
-
- move_window_start(rq, wallclock, update_sum, p);
- window_start = rq->window_start;
-
- if (is_idle_task(p)) {
- if (!irqtime && !(event == PUT_PREV_TASK &&
- cpu_is_waiting_on_io(rq)))
- goto done;
-
- if (irqtime && !cpu_is_waiting_on_io(rq))
- mark_start = wallclock - irqtime;
-
- if (window_start > mark_start) {
- delta = window_start - mark_start;
- if (delta > window_size) {
- rq->curr_runnable_sum = 0;
- delta = window_size;
- }
- delta = scale_exec_time(delta, rq);
- rq->curr_runnable_sum += delta;
- rq->prev_runnable_sum = rq->curr_runnable_sum;
- rq->curr_runnable_sum = 0;
- mark_start = window_start;
- }
- delta = wallclock - mark_start;
- delta = scale_exec_time(delta, rq);
- rq->curr_runnable_sum += delta;
+ update_window_start(rq, wallclock);
+ if (!p->ravg.mark_start)
goto done;
- }
- do {
- int nr_full_windows = 0;
- u64 now = wallclock;
- u32 sum = 0;
- u32 partial_demand = p->ravg.partial_demand;
-
- new_window = 0;
-
- if (window_start > mark_start) {
- delta = window_start - mark_start;
- nr_full_windows = div64_u64(delta, window_size);
- window_start -= nr_full_windows * window_size;
- now = window_start;
- new_window = 1;
- }
-
- /*
- * Tasks marked as DONT_ACCOUNT will not be accounted in
- * rq->prev/curr_runnable_sum. We however want to perform
- * maintenance duties on other counters such as window_start
- * and roll over of curr_runnable_sum into prev_runnable_sum
- * when update_task_ravg() is called on such tasks.
- */
- if (update_sum && !(p->ravg.flags & DONT_ACCOUNT)) {
- delta = now - mark_start;
- delta = scale_exec_time(delta, rq);
- BUG_ON(delta < 0);
-
- p->ravg.sum += delta;
- if (sched_freq_legacy_mode && (event == PUT_PREV_TASK ||
- event == TASK_UPDATE))
- rq->curr_runnable_sum += delta;
-
- if (unlikely(p->ravg.sum > window_size))
- p->ravg.sum = window_size;
- }
-
- update_history(rq, p, p->ravg.sum, 1, update_sum,
- new_window, event);
- if (!new_window)
- break;
-
- if (nr_full_windows) {
- window_start += nr_full_windows * window_size;
- if (update_sum && !(p->ravg.flags & DONT_ACCOUNT))
- sum = window_size;
- sum = scale_exec_time(sum, rq);
- update_history(rq, p, sum, nr_full_windows,
- update_sum, new_window, event);
- }
-
- if (update_sum) {
- if (event == PUT_PREV_TASK || event == TASK_UPDATE) {
- if (sched_freq_legacy_mode) {
- if (nr_full_windows) {
- /* sum == scaled window_size */
- rq->curr_runnable_sum = sum;
- }
- rq->prev_runnable_sum =
- rq->curr_runnable_sum;
- rq->curr_runnable_sum = 0;
- mark_start = window_start;
- continue;
- }
-
- if (!nr_full_windows) {
- rq->curr_runnable_sum -= partial_demand;
- rq->curr_runnable_sum += p->ravg.demand;
- rq->prev_runnable_sum =
- rq->curr_runnable_sum;
- } else {
- rq->prev_runnable_sum = p->ravg.demand;
- }
- rq->curr_runnable_sum = p->ravg.partial_demand;
- p->ravg.flags |= CURR_WINDOW_CONTRIB;
- } else if (!sched_freq_legacy_mode) {
- if (!nr_full_windows) {
- rq->prev_runnable_sum -= partial_demand;
- BUG_ON((s64)rq->prev_runnable_sum < 0);
- }
- rq->prev_runnable_sum += p->ravg.demand;
- rq->curr_runnable_sum += p->ravg.partial_demand;
- p->ravg.flags |= CURR_WINDOW_CONTRIB;
- p->ravg.flags |= PREV_WINDOW_CONTRIB;
- /* Todo: check if freq change is needed */
- }
- }
-
- mark_start = window_start;
- } while (new_window);
-
- /*
- * We depend on task's partial_demand to be always represented in
- * rq->curr_runnable_sum and its demand to be represented in
- * rq->prev_runnable_sum. When task wakes up (TASK_WAKE) or is picked to
- * run (PICK_NEXT_TASK) or migrated (TASK_MIGRATE) with
- * sched_account_wait_time == 0, ensure this dependency is met.
- */
-
- if (!sched_freq_legacy_mode && !(p->ravg.flags & CURR_WINDOW_CONTRIB)) {
- rq->curr_runnable_sum += p->ravg.partial_demand;
- p->ravg.flags |= CURR_WINDOW_CONTRIB;
- }
-
- if (!sched_freq_legacy_mode && !(p->ravg.flags & PREV_WINDOW_CONTRIB)) {
- rq->prev_runnable_sum += p->ravg.demand;
- p->ravg.flags |= PREV_WINDOW_CONTRIB;
- }
+ update_task_demand(p, rq, event, wallclock);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
done:
trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime);
@@ -1737,15 +1940,14 @@ static void init_cpu_efficiency(void)
static void reset_task_stats(struct task_struct *p)
{
- int i;
+ u32 sum = 0;
- p->ravg.sum = 0;
- p->ravg.demand = 0;
- p->ravg.partial_demand = 0;
- p->ravg.flags &= ~(CURR_WINDOW_CONTRIB | PREV_WINDOW_CONTRIB);
- for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
- p->ravg.sum_history[i] = 0;
- p->ravg.mark_start = 0;
+ if (exiting_task(p))
+ sum = EXITING_TASK_MARKER;
+
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ /* Retain EXITING_TASK marker */
+ p->ravg.sum_history[0] = sum;
}
static inline void mark_task_starting(struct task_struct *p)
@@ -1759,14 +1961,6 @@ static inline void mark_task_starting(struct task_struct *p)
}
p->ravg.mark_start = wallclock;
- if (sched_freq_legacy_mode)
- return;
-
- update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- rq->prev_runnable_sum += p->ravg.demand;
- rq->curr_runnable_sum += p->ravg.partial_demand;
- p->ravg.flags |= CURR_WINDOW_CONTRIB;
- p->ravg.flags |= PREV_WINDOW_CONTRIB;
}
static int update_alignment;
@@ -1790,7 +1984,9 @@ static inline void set_window_start(struct rq *rq)
raw_spin_unlock(&rq->lock);
double_rq_lock(rq, sync_rq);
rq->window_start = cpu_rq(sync_cpu)->window_start;
+#ifdef CONFIG_SCHED_FREQ_INPUT
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+#endif
raw_spin_unlock(&sync_rq->lock);
}
@@ -1815,12 +2011,9 @@ static void reset_all_task_stats(void)
}
/*
- * sched_exit() - Set DONT_ACCOUNT bit in task's ravg.flags
+ * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field
*
- * This will remove an exiting task's stats from cpu busy counters
- * (rq->curr/prev_runnable_sum) and also reset its stats. DONT_ACCOUNT bit is
- * also set in exiting tasks ravg.flags so that its future usage of cpu is
- * discounted from cpu busy time.
+ * Stop accounting (exiting) task's future cpu usage
*
* We need this so that reset_all_windows_stats() can function correctly.
* reset_all_window_stats() depends on do_each_thread/for_each_thread task
@@ -1841,17 +2034,9 @@ void sched_exit(struct task_struct *p)
wallclock = sched_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
dequeue_task(rq, p, 0);
- if (!sched_disable_window_stats && !sched_freq_legacy_mode &&
- (p->ravg.flags & CURR_WINDOW_CONTRIB))
- rq->curr_runnable_sum -= p->ravg.partial_demand;
- if (!sched_disable_window_stats && !sched_freq_legacy_mode &&
- (p->ravg.flags & PREV_WINDOW_CONTRIB))
- rq->prev_runnable_sum -= p->ravg.demand;
- BUG_ON((s64)rq->curr_runnable_sum < 0);
- BUG_ON((s64)rq->prev_runnable_sum < 0);
reset_task_stats(p);
p->ravg.mark_start = wallclock;
- p->ravg.flags |= DONT_ACCOUNT;
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
enqueue_task(rq, p, 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -1882,11 +2067,31 @@ static void enable_window_stats(void)
}
+enum reset_reason_code {
+ WINDOW_CHANGE,
+ POLICY_CHANGE,
+ ACCOUNT_WAIT_TIME_CHANGE,
+ HIST_SIZE_CHANGE,
+ MIGRATION_FIXUP_CHANGE,
+ FREQ_ACCOUNT_WAIT_TIME_CHANGE
+};
+
+const char *sched_window_reset_reasons[] = {
+ "WINDOW_CHANGE",
+ "POLICY_CHANGE",
+ "ACCOUNT_WAIT_TIME_CHANGE",
+ "HIST_SIZE_CHANGE",
+ "MIGRATION_FIXUP_CHANGE",
+ "FREQ_ACCOUNT_WAIT_TIME_CHANGE"};
+
/* Called with IRQs enabled */
void reset_all_window_stats(u64 window_start, unsigned int window_size)
{
int cpu;
unsigned long flags;
+ u64 start_ts = sched_clock();
+ int reason = WINDOW_CHANGE;
+ unsigned int old = 0, new = 0;
disable_window_stats();
@@ -1911,15 +2116,44 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
if (window_start)
rq->window_start = window_start;
+#ifdef CONFIG_SCHED_FREQ_INPUT
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+#endif
rq->cumulative_runnable_avg = 0;
fixup_nr_big_small_task(cpu);
}
- sched_window_stats_policy = sysctl_sched_window_stats_policy;
- sched_account_wait_time = sysctl_sched_account_wait_time;
- sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
- sched_freq_legacy_mode = sysctl_sched_freq_legacy_mode;
+ if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
+ reason = POLICY_CHANGE;
+ old = sched_window_stats_policy;
+ new = sysctl_sched_window_stats_policy;
+ sched_window_stats_policy = sysctl_sched_window_stats_policy;
+ } else if (sched_account_wait_time != sysctl_sched_account_wait_time) {
+ reason = ACCOUNT_WAIT_TIME_CHANGE;
+ old = sched_account_wait_time;
+ new = sysctl_sched_account_wait_time;
+ sched_account_wait_time = sysctl_sched_account_wait_time;
+ } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
+ reason = HIST_SIZE_CHANGE;
+ old = sched_ravg_hist_size;
+ new = sysctl_sched_ravg_hist_size;
+ sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
+ }
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ else if (sched_migration_fixup != sysctl_sched_migration_fixup) {
+ reason = MIGRATION_FIXUP_CHANGE;
+ old = sched_migration_fixup;
+ new = sysctl_sched_migration_fixup;
+ sched_migration_fixup = sysctl_sched_migration_fixup;
+ } else if (sched_freq_account_wait_time !=
+ sysctl_sched_freq_account_wait_time) {
+ reason = FREQ_ACCOUNT_WAIT_TIME_CHANGE;
+ old = sched_freq_account_wait_time;
+ new = sysctl_sched_freq_account_wait_time;
+ sched_freq_account_wait_time =
+ sysctl_sched_freq_account_wait_time;
+ }
+#endif
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
@@ -1927,6 +2161,9 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
}
local_irq_restore(flags);
+
+ trace_sched_reset_all_window_stats(window_start, window_size,
+ sched_clock() - start_ts, reason, old, new);
}
#ifdef CONFIG_SCHED_FREQ_INPUT
@@ -1993,6 +2230,89 @@ int sched_set_window(u64 window_start, unsigned int window_size)
return 0;
}
+
+static void fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+
+ if (!sched_enable_hmp || !sched_migration_fixup ||
+ exiting_task(p) || (!p->on_rq && p->state != TASK_WAKING))
+ return;
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ if (sched_disable_window_stats)
+ goto done;
+
+ wallclock = sched_clock();
+
+ update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE,
+ wallclock, 0);
+ update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ /*
+ * In case of migration of a task on a runqueue, on_rq == 1,
+ * however its load is removed from its runqueue.
+ * update_task_ravg() below can update its demand, which
+ * will require its load on runqueue to be adjusted to
+ * reflect new demand. Restore load temporarily for such
+ * task on its runqueue
+ */
+ if (p->on_rq) {
+ inc_cumulative_runnable_avg(src_rq, p);
+ if (p->sched_class == &fair_sched_class)
+ inc_nr_big_small_task(src_rq, p);
+ }
+
+ update_task_ravg(p, task_rq(p), TASK_MIGRATE,
+ wallclock, 0);
+
+ /*
+ * Remove task's load from rq as it's now migrating to
+ * another cpu.
+ */
+ if (p->on_rq) {
+ dec_cumulative_runnable_avg(src_rq, p);
+ if (p->sched_class == &fair_sched_class)
+ dec_nr_big_small_task(src_rq, p);
+ }
+
+ if (p->ravg.curr_window) {
+ src_rq->curr_runnable_sum -= p->ravg.curr_window;
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ }
+
+ if (p->ravg.prev_window) {
+ src_rq->prev_runnable_sum -= p->ravg.prev_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ BUG_ON((s64)src_rq->prev_runnable_sum < 0);
+ BUG_ON((s64)src_rq->curr_runnable_sum < 0);
+
+ trace_sched_migration_update_sum(src_rq, p);
+ trace_sched_migration_update_sum(dest_rq, p);
+
+done:
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+#else
+
+static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
+
+static inline int
+heavy_task_wakeup(struct task_struct *p, struct rq *rq, int event)
+{
+ return 0;
+}
+
#endif /* CONFIG_SCHED_FREQ_INPUT */
/* Keep track of max/min capacity possible across CPUs "currently" */
@@ -2187,6 +2507,7 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
cpu_rq(cpu)->cur_freq = new_freq;
raw_spin_unlock_irqrestore(&rq->lock, flags);
+#ifdef CONFIG_SCHED_FREQ_INPUT
/* clear freq request for CPUs in the same freq domain */
if (!rq->freq_requested)
return 0;
@@ -2203,6 +2524,7 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
for_each_cpu(cpu, &rq->freq_domain_cpumask)
cpu_rq(cpu)->freq_requested = 0;
raw_spin_unlock_irqrestore(&rq->lock, flags);
+#endif
return 0;
}
@@ -2240,87 +2562,12 @@ static int register_sched_callback(void)
*/
core_initcall(register_sched_callback);
-static void fixup_busy_time(struct task_struct *p, int new_cpu)
-{
- struct rq *src_rq = task_rq(p);
- struct rq *dest_rq = cpu_rq(new_cpu);
- u64 wallclock;
-
- if (p->state == TASK_WAKING)
- double_rq_lock(src_rq, dest_rq);
-
- if (sched_disable_window_stats)
- goto done;
-
- wallclock = sched_clock();
-
- update_task_ravg(task_rq(p)->curr, task_rq(p),
- TASK_UPDATE,
- wallclock, 0);
- update_task_ravg(dest_rq->curr, dest_rq,
- TASK_UPDATE, wallclock, 0);
-
- /*
- * In case of migration of task on runqueue, on_rq =1,
- * however its load is removed from its runqueue.
- * update_task_ravg() below can update its demand, which
- * will require its load on runqueue to be adjusted to
- * reflect new demand. Restore load temporarily for such
- * task on its runqueue
- */
- if (p->on_rq) {
- inc_cumulative_runnable_avg(src_rq, p);
- if (p->sched_class == &fair_sched_class)
- inc_nr_big_small_task(src_rq, p);
- }
-
- update_task_ravg(p, task_rq(p), TASK_MIGRATE,
- wallclock, 0);
-
- /*
- * Remove task's load from rq as its now migrating to
- * another cpu.
- */
- if (p->on_rq) {
- dec_cumulative_runnable_avg(src_rq, p);
- if (p->sched_class == &fair_sched_class)
- dec_nr_big_small_task(src_rq, p);
- }
-
- if (p->ravg.flags & CURR_WINDOW_CONTRIB) {
- src_rq->curr_runnable_sum -= p->ravg.partial_demand;
- dest_rq->curr_runnable_sum += p->ravg.partial_demand;
- }
-
- if (p->ravg.flags & PREV_WINDOW_CONTRIB) {
- src_rq->prev_runnable_sum -= p->ravg.demand;
- dest_rq->prev_runnable_sum += p->ravg.demand;
- }
+#else /* CONFIG_SCHED_HMP */
- BUG_ON((s64)src_rq->prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->curr_runnable_sum < 0);
+static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
- trace_sched_migration_update_sum(src_rq, p);
- trace_sched_migration_update_sum(dest_rq, p);
-
-done:
- if (p->state == TASK_WAKING)
- double_rq_unlock(src_rq, dest_rq);
-}
-
-/* A long sleep is defined as sleeping at least one full window prior
- * to the current window start. */
-static inline int is_long_sleep(struct rq *rq, struct task_struct *p)
-{
- if (p->ravg.mark_start > rq->window_start)
- return 0;
-
- return ((rq->window_start - p->ravg.mark_start) > sched_ravg_window);
-}
-
-#else /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
-
-static inline int is_long_sleep(struct rq *rq, struct task_struct *p)
+static inline int
+heavy_task_wakeup(struct task_struct *p, struct rq *rq, int event)
{
return 0;
}
@@ -2331,11 +2578,6 @@ update_task_ravg(struct task_struct *p, struct rq *rq,
{
}
-static inline int rq_freq_margin(struct rq *rq)
-{
- return INT_MAX;
-}
-
static inline void init_cpu_efficiency(void) {}
static inline void mark_task_starting(struct task_struct *p) {}
@@ -2344,9 +2586,7 @@ static inline void set_window_start(struct rq *rq) {}
static inline void migrate_sync_cpu(int cpu) {}
-static inline void fixup_busy_time(struct task_struct *p, int new_cpu) {}
-
-#endif /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_SMP
/*
@@ -2611,9 +2851,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.nr_migrations++;
perf_event_task_migrate(p);
- if (sched_enable_hmp && (p->on_rq || p->state == TASK_WAKING)
- && !sched_freq_legacy_mode)
- fixup_busy_time(p, new_cpu);
+ fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -3258,7 +3496,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
int cpu, src_cpu, success = 0;
int notify = 0;
struct migration_notify_data mnd;
- int long_sleep = 0;
+ int heavy_task = 0;
#ifdef CONFIG_SMP
struct rq *rq;
u64 wallclock;
@@ -3326,7 +3564,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
raw_spin_lock(&rq->lock);
wallclock = sched_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- long_sleep = is_long_sleep(rq, p);
+ heavy_task = heavy_task_wakeup(p, rq, TASK_WAKE);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
raw_spin_unlock(&rq->lock);
@@ -3374,10 +3612,11 @@ out:
atomic_notifier_call_chain(&migration_notifier_head,
0, (void *)&mnd);
- if (long_sleep || !same_freq_domain(src_cpu, cpu))
+ if (!same_freq_domain(src_cpu, cpu)) {
check_for_freq_change(cpu_rq(cpu));
- if (!long_sleep && !same_freq_domain(src_cpu, cpu))
check_for_freq_change(cpu_rq(src_cpu));
+ } else if (heavy_task)
+ check_for_freq_change(cpu_rq(cpu));
return success;
}
@@ -3794,8 +4033,6 @@ void wake_up_new_task(struct task_struct *p)
}
#endif
task_rq_unlock(rq, p, &flags);
- if (init_task_load)
- check_for_freq_change(rq);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -8891,11 +9128,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-#ifdef CONFIG_SCHED_HMP
- rq->nr_small_tasks = rq->nr_big_tasks = 0;
- rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
- rq->hmp_flags = 0;
-#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
@@ -8915,7 +9147,7 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
rq->cur_freq = 1;
rq->max_freq = 1;
rq->min_freq = 1;
@@ -8926,8 +9158,13 @@ void __init sched_init(void)
rq->capacity = 1024;
rq->load_scale_factor = 1024;
rq->window_start = 0;
+ rq->nr_small_tasks = rq->nr_big_tasks = 0;
+ rq->hmp_flags = 0;
+#ifdef CONFIG_SCHED_FREQ_INPUT
rq->freq_requested = 0;
rq->freq_requested_ts = 0;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+#endif
#endif
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
rq->cstate = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5290ec2a14bf..f9bb03279152 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -309,7 +309,7 @@ do { \
#ifdef CONFIG_SMP
P(cpu_capacity);
#endif
-#if defined(CONFIG_SCHED_HMP) || defined(CONFIG_SCHED_FREQ_INPUT)
+#ifdef CONFIG_SCHED_HMP
P(load_scale_factor);
P(capacity);
P(max_possible_capacity);
@@ -632,7 +632,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
__P(load_avg);
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
P(ravg.demand);
P(se.avg.runnable_avg_sum_scaled);
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5e6f9a6047e9..e3ed01a638df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2536,7 +2536,7 @@ static u32 __compute_runnable_contrib(u64 n)
static void add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta);
static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods);
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
/* Initial task load. Newly created tasks are assigned this load. */
unsigned int __read_mostly sched_init_task_load_pelt;
@@ -2559,10 +2559,6 @@ unsigned int max_task_load(void)
return sched_ravg_window;
}
-#endif /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
-
-#ifdef CONFIG_SCHED_HMP
-
/* Use this knob to turn on or off HMP-aware task placement logic */
unsigned int __read_mostly sched_enable_hmp = 0;
@@ -2621,6 +2617,18 @@ unsigned int __read_mostly sched_small_task;
unsigned int __read_mostly sysctl_sched_small_task_pct = 10;
/*
+ * Tasks with demand >= sched_heavy_task will have their
+ * window-based demand added to the previous window's CPU
+ * time when they wake up, if they have slept for at least
+ * one full window. This feature is disabled when the tunable
+ * is set to 0 (the default).
+ */
+#ifdef CONFIG_SCHED_FREQ_INPUT
+unsigned int __read_mostly sysctl_sched_heavy_task_pct;
+unsigned int __read_mostly sched_heavy_task;
+#endif
+
+/*
* Tasks whose bandwidth consumption on a cpu is more than
* sched_upmigrate are considered "big" tasks. Big tasks will be
* considered for "up" migration, i.e migrating to a cpu with better
@@ -2678,6 +2686,11 @@ void set_hmp_defaults(void)
sched_downmigrate =
pct_to_real(sysctl_sched_downmigrate_pct);
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ sched_heavy_task =
+ pct_to_real(sysctl_sched_heavy_task_pct);
+#endif
+
sched_init_task_load_pelt =
div64_u64((u64)sysctl_sched_init_task_load_pct *
(u64)LOAD_AVG_MAX, 100);
@@ -3545,29 +3558,27 @@ static inline int capacity(struct rq *rq)
#endif /* CONFIG_SCHED_HMP */
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
void init_new_task_load(struct task_struct *p)
{
int i;
- p->ravg.sum = 0;
- p->ravg.flags = 0;
+ memset(&p->ravg, 0, sizeof(struct ravg));
for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
p->ravg.sum_history[i] = sched_init_task_load_windows;
p->se.avg.runnable_avg_sum_scaled = sched_init_task_load_pelt;
p->ravg.demand = sched_init_task_load_windows;
- p->ravg.partial_demand = sched_init_task_load_windows;
}
-#else /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#else /* CONFIG_SCHED_HMP */
void init_new_task_load(struct task_struct *p)
{
}
-#endif /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_HMP */
#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
@@ -3948,7 +3959,7 @@ static inline int idle_balance(struct rq *rq)
#endif /* CONFIG_SMP */
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
/* Return task demand in percentage scale */
unsigned int pct_task_load(struct task_struct *p)
@@ -3998,7 +4009,7 @@ static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods)
periods);
}
-#else /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#else /* CONFIG_SCHED_HMP */
static inline void
add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta)
@@ -4009,7 +4020,7 @@ static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods)
{
}
-#endif /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_HMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index decbbd2c6f48..1fc108e5cdda 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -644,15 +644,13 @@ struct rq {
u64 max_idle_balance_cost;
#endif
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
/*
* max_freq = user or thermal defined maximum
* max_possible_freq = maximum supported by hardware
*/
unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
struct cpumask freq_domain_cpumask;
- unsigned int freq_requested;
- u64 freq_requested_ts;
u64 cumulative_runnable_avg;
int efficiency; /* Differentiate cpus with different IPC capability */
@@ -661,6 +659,13 @@ struct rq {
int max_possible_capacity;
u64 window_start;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ unsigned int freq_requested;
+ u64 freq_requested_ts;
+#endif
+#endif
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
u64 curr_runnable_sum;
u64 prev_runnable_sum;
#endif
@@ -943,7 +948,7 @@ static inline void sched_ttwu_pending(void) { }
extern void init_new_task_load(struct task_struct *p);
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
+#ifdef CONFIG_SCHED_HMP
#define WINDOW_STATS_RECENT 0
#define WINDOW_STATS_MAX 1
@@ -982,6 +987,8 @@ static inline u64 scale_load_to_cpu(u64 load, int cpu)
return load;
}
#endif
+extern unsigned int sched_heavy_task;
+extern void fixup_nr_big_small_task(int cpu);
unsigned int max_task_load(void);
extern void sched_account_irqtime(int cpu, struct task_struct *curr,
u64 delta, u64 wallclock);
@@ -1007,7 +1014,7 @@ dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
BUG_ON((s64)rq->cumulative_runnable_avg < 0);
}
-#else /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#else /* CONFIG_SCHED_HMP */
static inline int pct_task_load(struct task_struct *p) { return 0; }
@@ -1036,7 +1043,7 @@ static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
{
}
-#endif /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_SCHED_FREQ_INPUT
extern void check_for_freq_change(struct rq *rq);
@@ -1052,15 +1059,9 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu)
return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask);
}
-#ifdef CONFIG_SCHED_HMP
-#define init_task_load sysctl_sched_init_task_load_pct
-#else
-#define init_task_load 0
-#endif
-
#else /* CONFIG_SCHED_FREQ_INPUT */
-#define init_task_load 0
+#define sched_migration_fixup 0
static inline void check_for_freq_change(struct rq *rq) { }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7d54b6b1bfed..700686eb6323 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -292,6 +292,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "sched_wakeup_load_threshold",
+ .data = &sysctl_sched_wakeup_load_threshold,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_SCHED_FREQ_INPUT
{
.procname = "sched_freq_inc_notify_slack_pct",
@@ -307,16 +314,37 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#endif
-#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
{
- .procname = "sched_freq_legacy_mode",
- .data = &sysctl_sched_freq_legacy_mode,
+ .procname = "sched_migration_fixup",
+ .data = &sysctl_sched_migration_fixup,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_window_update_handler,
},
{
+ .procname = "sched_freq_account_wait_time",
+ .data = &sysctl_sched_freq_account_wait_time,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
+ {
+ .procname = "sched_heavy_task",
+ .data = &sysctl_sched_heavy_task_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ },
+ {
+ .procname = "sched_gov_response_time",
+ .data = &sysctl_sched_gov_response_time,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_SCHED_HMP
+ {
.procname = "sched_account_wait_time",
.data = &sysctl_sched_account_wait_time,
.maxlen = sizeof(unsigned int),
@@ -338,22 +366,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_window_update_handler,
},
{
- .procname = "sched_gov_response_time",
- .data = &sysctl_sched_gov_response_time,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_wakeup_load_threshold",
- .data = &sysctl_sched_wakeup_load_threshold,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_SCHED_HMP
- {
.procname = "sched_small_task",
.data = &sysctl_sched_small_task_pct,
.maxlen = sizeof(unsigned int),