 include/linux/sched.h        |  10
 include/linux/sched/sysctl.h |   1
 include/trace/events/sched.h | 115
 kernel/sched/core.c          | 735
 kernel/sched/fair.c          |  26
 kernel/sched/sched.h         |  26
 kernel/sched/sched_avg.c     |   2
 kernel/sysctl.c              |   7
 8 files changed, 764 insertions(+), 158 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7e107c3d7a5c..61a5c00e66cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -328,6 +328,16 @@ enum task_event {
IRQ_UPDATE = 5,
};
+/* Note: this needs to be kept in sync with the migrate_type_names array */
+enum migrate_types {
+ GROUP_TO_RQ,
+ RQ_TO_GROUP,
+ RQ_TO_RQ,
+ GROUP_TO_GROUP,
+};
+
+extern const char *migrate_type_names[];
+
#include <linux/spinlock.h>
/*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 84bac3e07709..2ac84af88802 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -75,6 +75,7 @@ extern unsigned int sysctl_sched_restrict_cluster_spill;
#if defined(CONFIG_SCHED_FREQ_INPUT)
extern unsigned int sysctl_sched_new_task_windows;
extern unsigned int sysctl_sched_pred_alert_freq;
+extern unsigned int sysctl_sched_freq_aggregate;
#endif
#else /* CONFIG_SCHED_HMP */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 6c5fa35e2875..81415b78ef39 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -9,6 +9,8 @@
#include <linux/binfmts.h>
struct rq;
+struct group_cpu_time;
+struct migration_sum_data;
extern const char *task_event_names[];
/*
@@ -269,9 +271,10 @@ TRACE_EVENT(sched_set_boost,
TRACE_EVENT(sched_update_task_ravg,
TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
- u64 wallclock, u64 irqtime, u32 cycles, u32 exec_time),
+ u64 wallclock, u64 irqtime, u32 cycles, u32 exec_time,
+ struct group_cpu_time *cpu_time),
- TP_ARGS(p, rq, evt, wallclock, irqtime, cycles, exec_time),
+ TP_ARGS(p, rq, evt, wallclock, irqtime, cycles, exec_time, cpu_time),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
@@ -290,8 +293,12 @@ TRACE_EVENT(sched_update_task_ravg,
__field( int, cpu )
#ifdef CONFIG_SCHED_FREQ_INPUT
__field(unsigned int, pred_demand )
- __field( u64, cs )
- __field( u64, ps )
+ __field( u64, rq_cs )
+ __field( u64, rq_ps )
+ __field( u64, grp_cs )
+ __field( u64, grp_ps )
+ __field( u64, grp_nt_cs )
+ __field( u64, grp_nt_ps )
__field( u32, curr_window )
__field( u32, prev_window )
__field( u64, nt_cs )
@@ -318,8 +325,12 @@ TRACE_EVENT(sched_update_task_ravg,
__entry->irqtime = irqtime;
#ifdef CONFIG_SCHED_FREQ_INPUT
__entry->pred_demand = p->ravg.pred_demand;
- __entry->cs = rq->curr_runnable_sum;
- __entry->ps = rq->prev_runnable_sum;
+ __entry->rq_cs = rq->curr_runnable_sum;
+ __entry->rq_ps = rq->prev_runnable_sum;
+ __entry->grp_cs = cpu_time ? cpu_time->curr_runnable_sum : 0;
+ __entry->grp_ps = cpu_time ? cpu_time->prev_runnable_sum : 0;
+ __entry->grp_nt_cs = cpu_time ? cpu_time->nt_curr_runnable_sum : 0;
+ __entry->grp_nt_ps = cpu_time ? cpu_time->nt_prev_runnable_sum : 0;
__entry->curr_window = p->ravg.curr_window;
__entry->prev_window = p->ravg.prev_window;
__entry->nt_cs = rq->nt_curr_runnable_sum;
@@ -330,7 +341,7 @@ TRACE_EVENT(sched_update_task_ravg,
TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
#ifdef CONFIG_SCHED_FREQ_INPUT
- " pred_demand %u cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u"
+ " pred_demand %u rq_cs %llu rq_ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu"
#endif
, __entry->wallclock, __entry->win_start, __entry->delta,
task_event_names[__entry->evt], __entry->cpu,
@@ -339,10 +350,12 @@ TRACE_EVENT(sched_update_task_ravg,
__entry->delta_m, __entry->demand,
__entry->sum, __entry->irqtime
#ifdef CONFIG_SCHED_FREQ_INPUT
- , __entry->pred_demand, __entry->cs, __entry->ps,
+ , __entry->pred_demand, __entry->rq_cs, __entry->rq_ps,
__entry->curr_window, __entry->prev_window,
__entry->nt_cs, __entry->nt_ps,
- __entry->active_windows
+ __entry->active_windows,
+ __entry->grp_cs, __entry->grp_ps,
+ __entry->grp_nt_cs, __entry->grp_nt_ps
#endif
)
);
@@ -506,31 +519,62 @@ TRACE_EVENT(sched_update_pred_demand,
TRACE_EVENT(sched_migration_update_sum,
- TP_PROTO(struct rq *rq, struct task_struct *p),
+ TP_PROTO(struct task_struct *p, enum migrate_types migrate_type, struct migration_sum_data *d),
- TP_ARGS(rq, p),
+ TP_ARGS(p, migrate_type, d),
TP_STRUCT__entry(
- __field(int, cpu )
+ __field(int, tcpu )
__field(int, pid )
__field( u64, cs )
__field( u64, ps )
__field( s64, nt_cs )
__field( s64, nt_ps )
+ __field(enum migrate_types, migrate_type )
+ __field( s64, src_cs )
+ __field( s64, src_ps )
+ __field( s64, dst_cs )
+ __field( s64, dst_ps )
+ __field( s64, src_nt_cs )
+ __field( s64, src_nt_ps )
+ __field( s64, dst_nt_cs )
+ __field( s64, dst_nt_ps )
),
TP_fast_assign(
- __entry->cpu = cpu_of(rq);
- __entry->cs = rq->curr_runnable_sum;
- __entry->ps = rq->prev_runnable_sum;
- __entry->nt_cs = (s64)rq->nt_curr_runnable_sum;
- __entry->nt_ps = (s64)rq->nt_prev_runnable_sum;
+ __entry->tcpu = task_cpu(p);
__entry->pid = p->pid;
- ),
-
- TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d",
- __entry->cpu, __entry->cs, __entry->ps,
- __entry->nt_cs, __entry->nt_ps, __entry->pid)
+ __entry->migrate_type = migrate_type;
+ __entry->src_cs = d->src_rq ?
+ d->src_rq->curr_runnable_sum :
+ d->src_cpu_time->curr_runnable_sum;
+ __entry->src_ps = d->src_rq ?
+ d->src_rq->prev_runnable_sum :
+ d->src_cpu_time->prev_runnable_sum;
+ __entry->dst_cs = d->dst_rq ?
+ d->dst_rq->curr_runnable_sum :
+ d->dst_cpu_time->curr_runnable_sum;
+ __entry->dst_ps = d->dst_rq ?
+ d->dst_rq->prev_runnable_sum :
+ d->dst_cpu_time->prev_runnable_sum;
+ __entry->src_nt_cs = d->src_rq ?
+ d->src_rq->nt_curr_runnable_sum :
+ d->src_cpu_time->nt_curr_runnable_sum;
+ __entry->src_nt_ps = d->src_rq ?
+ d->src_rq->nt_prev_runnable_sum :
+ d->src_cpu_time->nt_prev_runnable_sum;
+ __entry->dst_nt_cs = d->dst_rq ?
+ d->dst_rq->nt_curr_runnable_sum :
+ d->dst_cpu_time->nt_curr_runnable_sum;
+ __entry->dst_nt_ps = d->dst_rq ?
+ d->dst_rq->nt_prev_runnable_sum :
+ d->dst_cpu_time->nt_prev_runnable_sum;
+ ),
+
+ TP_printk("pid %d task_cpu %d migrate_type %s src_cs %llu src_ps %llu dst_cs %lld dst_ps %lld src_nt_cs %llu src_nt_ps %llu dst_nt_cs %lld dst_nt_ps %lld",
+ __entry->pid, __entry->tcpu, migrate_type_names[__entry->migrate_type],
+ __entry->src_cs, __entry->src_ps, __entry->dst_cs, __entry->dst_ps,
+ __entry->src_nt_cs, __entry->src_nt_ps, __entry->dst_nt_cs, __entry->dst_nt_ps)
);
TRACE_EVENT(sched_get_busy,
@@ -562,15 +606,17 @@ TRACE_EVENT(sched_get_busy,
TRACE_EVENT(sched_freq_alert,
- TP_PROTO(int cpu, int pd_notif, u64 old_load, u64 new_load,
- u64 old_pred, u64 new_pred),
+ TP_PROTO(int cpu, int pd_notif, int check_groups, struct rq *rq,
+ u64 new_load),
- TP_ARGS(cpu, pd_notif, old_load, new_load, old_pred, new_pred),
+ TP_ARGS(cpu, pd_notif, check_groups, rq, new_load),
TP_STRUCT__entry(
__field( int, cpu )
__field( int, pd_notif )
- __field( u64, old_load )
+ __field( int, check_groups )
+ __field( u64, old_busy_time )
+ __field( u64, ps )
__field( u64, new_load )
__field( u64, old_pred )
__field( u64, new_pred )
@@ -579,17 +625,18 @@ TRACE_EVENT(sched_freq_alert,
TP_fast_assign(
__entry->cpu = cpu;
__entry->pd_notif = pd_notif;
- __entry->old_load = old_load;
+ __entry->check_groups = check_groups;
+ __entry->old_busy_time = rq->old_busy_time;
+ __entry->ps = rq->prev_runnable_sum;
__entry->new_load = new_load;
- __entry->old_pred = old_pred;
- __entry->new_pred = new_pred;
+ __entry->old_pred = rq->old_estimated_time;
+ __entry->new_pred = rq->hmp_stats.pred_demands_sum;
),
- TP_printk("cpu %d pd_notif=%d old_load=%llu new_load=%llu "
- "old_pred=%llu new_pred=%llu",
- __entry->cpu, __entry->pd_notif, __entry->old_load,
- __entry->new_load, __entry->old_pred,
- __entry->new_pred)
+ TP_printk("cpu %d pd_notif=%d check_groups %d old_busy_time=%llu prev_sum=%lld new_load=%llu old_pred=%llu new_pred=%llu",
+ __entry->cpu, __entry->pd_notif, __entry->check_groups,
+ __entry->old_busy_time, __entry->ps, __entry->new_load,
+ __entry->old_pred, __entry->new_pred)
);
#endif /* CONFIG_SCHED_FREQ_INPUT */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b55bbbd7431..87e93b3f3b4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -97,6 +97,9 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
"TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
"IRQ_UPDATE"};
+const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
+ "RQ_TO_RQ", "GROUP_TO_GROUP"};
+
ATOMIC_NOTIFIER_HEAD(migration_notifier_head);
ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
@@ -1864,6 +1867,61 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000;
static unsigned int sync_cpu;
+static LIST_HEAD(related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+
+#define for_each_related_thread_group(grp) \
+ list_for_each_entry(grp, &related_thread_groups, list)
+
+/*
+ * Demand aggregation for frequency purposes:
+ *
+ * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
+ * for frequency determination purposes. This aggregation is done per-cluster.
+ *
+ * CPU demand of tasks from various related groups is aggregated per-cluster and
+ * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
+ * by just rq->prev_runnable_sum.
+ *
+ * Some examples follow, which assume:
+ * Cluster0 = CPU0-3, Cluster1 = CPU4-7
+ * One related thread group A that has tasks A0, A1, A2
+ *
+ * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
+ * tasks belonging to group A are accumulated when they run on cpu X.
+ *
+ * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
+ * not belonging to group A are accumulated when they run on cpu X
+ *
+ * Let's say the stats for window M were as below:
+ *
+ * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
+ * Task A0 ran 5ms on CPU0
+ * Task B0 ran 1ms on CPU0
+ *
+ * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
+ * Task A1 ran 4ms on CPU1
+ * Task A2 ran 2ms on CPU1
+ * Task B1 ran 5ms on CPU1
+ *
+ * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
+ * CPU2 idle
+ *
+ * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
+ * CPU3 idle
+ *
+ * In this case, CPU1 was the most busy going by just its prev_sum counter.
+ * Demand from all group A tasks is added to CPU1. IOW, at the end of window M,
+ * the cpu busy time reported to the governor will be:
+ *
+ *
+ * C0 busy time = 1ms
+ * C1 busy time = 5 + 5 + 6 = 16ms
+ *
+ */
+static __read_mostly unsigned int sched_freq_aggregate;
+__read_mostly unsigned int sysctl_sched_freq_aggregate;
+
#define EXITING_TASK_MARKER 0xdeaddead
static inline int exiting_task(struct task_struct *p)
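
To make the aggregation rule above concrete, here is a minimal stand-alone user-space sketch (not part of the patch; it models only cluster0 from the example in the comment, and reduces max_busy_cpu selection to the rq->prev_runnable_sum comparison described there) that reproduces the busy times reported for window M:

#include <stdio.h>

#define NR_CPUS 4	/* cluster0 only, as in the example above */

int main(void)
{
	/* rq->prev_runnable_sum for tasks outside group A (ms) */
	unsigned long long rq_prev[NR_CPUS] = { 1, 5, 0, 0 };
	/* A->cpu_time[cpu].prev_runnable_sum for group A tasks (ms) */
	unsigned long long grp_prev[NR_CPUS] = { 5, 6, 0, 0 };
	unsigned long long reported[NR_CPUS];
	unsigned long long grp_total = 0;
	int cpu, max_busy_cpu = 0;

	/* max_busy_cpu is chosen by rq->prev_runnable_sum alone */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (rq_prev[cpu] > rq_prev[max_busy_cpu])
			max_busy_cpu = cpu;
		grp_total += grp_prev[cpu];
	}

	/* group demand of the whole cluster lands on max_busy_cpu */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		reported[cpu] = rq_prev[cpu];
		if (cpu == max_busy_cpu)
			reported[cpu] += grp_total;
		printf("C%d busy time = %llums\n", cpu, reported[cpu]);
	}
	return 0;	/* prints C0 = 1ms, C1 = 16ms, C2 = 0ms, C3 = 0ms */
}

This matches the figures in the comment: only the max busy cpu of the frequency domain has the cluster-wide group demand added, which is also what sched_get_cpus_busy() does below when no notifier was sent.
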
@@ -1955,12 +2013,67 @@ static inline unsigned int load_to_freq(struct rq *rq, u64 load)
return freq;
}
-/* Should scheduler alert governor for changing frequency? */
-static int send_notification(struct rq *rq, int check_pred)
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu);
+
+/*
+ * Return load from all related groups on the given cpu.
+ * Caller must ensure that related_thread_group_lock is held.
+ */
+static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load)
+{
+ struct related_thread_group *grp;
+
+ for_each_related_thread_group(grp) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = _group_cpu_time(grp, cpu);
+ *grp_load += cpu_time->prev_runnable_sum;
+ if (new_grp_load)
+ *new_grp_load += cpu_time->nt_prev_runnable_sum;
+ }
+}
+
+/*
+ * Return load from all related groups in the given frequency domain.
+ * Caller must ensure that related_thread_group_lock is held.
+ */
+static void group_load_in_freq_domain(struct cpumask *cpus,
+ u64 *grp_load, u64 *new_grp_load)
+{
+ struct related_thread_group *grp;
+ int j;
+
+ for_each_related_thread_group(grp) {
+ for_each_cpu(j, cpus) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = _group_cpu_time(grp, j);
+ *grp_load += cpu_time->prev_runnable_sum;
+ *new_grp_load += cpu_time->nt_prev_runnable_sum;
+ }
+ }
+}
+
+/*
+ * Should scheduler alert governor for changing frequency?
+ *
+ * @check_pred - evaluate frequency based on the predictive demand
+ * @check_groups - add load from all related groups on given cpu
+ *
+ * check_groups is set to 1 if a "related" task movement/wakeup is triggering
+ * the notification check. To avoid "re-aggregation" of demand in such cases,
+ * we check whether the migrated/woken task's demand (along with demand from
+ * existing tasks on the cpu) can be met on the target cpu.
+ *
+ */
+
+static int send_notification(struct rq *rq, int check_pred, int check_groups)
{
unsigned int cur_freq, freq_required;
unsigned long flags;
int rc = 0;
+ u64 group_load = 0, new_load;
if (!sched_enable_hmp)
return 0;
@@ -1982,8 +2095,22 @@ static int send_notification(struct rq *rq, int check_pred)
if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
return 0;
} else {
+ read_lock(&related_thread_group_lock);
+ /*
+ * Protect from concurrent update of rq->prev_runnable_sum and
+ * group cpu load
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (check_groups)
+ _group_load_in_cpu(cpu_of(rq), &group_load, NULL);
+
+ new_load = rq->prev_runnable_sum + group_load;
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ read_unlock(&related_thread_group_lock);
+
cur_freq = load_to_freq(rq, rq->old_busy_time);
- freq_required = load_to_freq(rq, rq->prev_runnable_sum);
+ freq_required = load_to_freq(rq, new_load);
if (nearly_same_freq(cur_freq, freq_required))
return 0;
@@ -1993,6 +2120,8 @@ static int send_notification(struct rq *rq, int check_pred)
if (!rq->notifier_sent) {
rq->notifier_sent = 1;
rc = 1;
+ trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
+ new_load);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -2000,17 +2129,13 @@ static int send_notification(struct rq *rq, int check_pred)
}
/* Alert governor if there is a need to change frequency */
-void check_for_freq_change(struct rq *rq, bool check_pred)
+void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
{
int cpu = cpu_of(rq);
- if (!send_notification(rq, check_pred))
+ if (!send_notification(rq, check_pred, check_groups))
return;
- trace_sched_freq_alert(cpu, check_pred, rq->old_busy_time,
- rq->prev_runnable_sum, rq->old_estimated_time,
- rq->hmp_stats.pred_demands_sum);
-
atomic_notifier_call_chain(
&load_alert_notifier_head, 0,
(void *)(long)cpu);
@@ -2031,11 +2156,21 @@ static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
if (event == TASK_WAKE)
return 0;
- if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
- event == TASK_UPDATE)
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
return 1;
- /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ /*
+ * TASK_UPDATE can be called on a sleeping task, when it is moved between
+ * related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? sched_freq_account_wait_time : 0;
+ }
+
+ /* TASK_MIGRATE, PICK_NEXT_TASK left */
return sched_freq_account_wait_time;
}
@@ -2262,6 +2397,15 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
event != PICK_NEXT_TASK)))
return;
+ /*
+ * TASK_UPDATE can be called on a sleeping task, when it is moved between
+ * related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (!p->on_rq && !sched_freq_account_wait_time)
+ return;
+ }
+
new = calc_pred_demand(rq, p);
old = p->ravg.pred_demand;
@@ -2290,7 +2434,14 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
u64 window_start = rq->window_start;
u32 window_size = sched_ravg_window;
u64 delta;
+ u64 *curr_runnable_sum = &rq->curr_runnable_sum;
+ u64 *prev_runnable_sum = &rq->prev_runnable_sum;
+ u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ int flip_counters = 0;
+ int prev_sum_reset = 0;
bool new_task;
+ struct related_thread_group *grp;
new_window = mark_start < window_start;
if (new_window) {
@@ -2302,6 +2453,51 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
new_task = is_new_task(p);
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ /* cpu_time protected by rq_lock */
+ struct group_cpu_time *cpu_time =
+ _group_cpu_time(grp, cpu_of(rq));
+
+ curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ if (cpu_time->window_start != rq->window_start) {
+ int nr_windows;
+
+ delta = rq->window_start - cpu_time->window_start;
+ nr_windows = div64_u64(delta, window_size);
+ if (nr_windows > 1)
+ prev_sum_reset = 1;
+
+ cpu_time->window_start = rq->window_start;
+ flip_counters = 1;
+ }
+
+ if (p_is_curr_task && new_window) {
+ u64 curr_sum = rq->curr_runnable_sum;
+ u64 nt_curr_sum = rq->nt_curr_runnable_sum;
+
+ if (nr_full_windows)
+ curr_sum = nt_curr_sum = 0;
+
+ rq->prev_runnable_sum = curr_sum;
+ rq->nt_prev_runnable_sum = nt_curr_sum;
+
+ rq->curr_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = 0;
+ }
+ } else {
+ if (p_is_curr_task && new_window) {
+ flip_counters = 1;
+ if (nr_full_windows)
+ prev_sum_reset = 1;
+ }
+ }
+
/* Handle per-task window rollover. We don't care about the idle
* task or exiting tasks. */
if (new_window && !is_idle_task(p) && !exiting_task(p)) {
@@ -2314,6 +2510,20 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
p->ravg.curr_window = 0;
}
+ if (flip_counters) {
+ u64 curr_sum = *curr_runnable_sum;
+ u64 nt_curr_sum = *nt_curr_runnable_sum;
+
+ if (prev_sum_reset)
+ curr_sum = nt_curr_sum = 0;
+
+ *prev_runnable_sum = curr_sum;
+ *nt_prev_runnable_sum = nt_curr_sum;
+
+ *curr_runnable_sum = 0;
+ *nt_curr_runnable_sum = 0;
+ }
+
if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
/* account_busy_for_cpu_time() = 0, so no update to the
* task's current window needs to be made. This could be
@@ -2331,19 +2541,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
/* A new window has started. The RQ demand must be rolled
* over if p is the current task. */
if (p_is_curr_task) {
- u64 prev_sum = 0, nt_prev_sum = 0;
-
- /* p is either idle task or an exiting task */
- if (!nr_full_windows) {
- prev_sum = rq->curr_runnable_sum;
- nt_prev_sum = rq->nt_curr_runnable_sum;
- }
-
- rq->prev_runnable_sum = prev_sum;
- rq->curr_runnable_sum = 0;
- rq->nt_prev_runnable_sum = nt_prev_sum;
- rq->nt_curr_runnable_sum = 0;
-
+ /* p is idle task */
+ BUG_ON(p != rq->idle);
} else if (heavy_task_wakeup(p, rq, event)) {
/* A new window has started. If p is a waking
* heavy task its prev_window contribution is faked
@@ -2353,9 +2552,9 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* can be controlled via the sched_heavy_task
* tunable. */
p->ravg.prev_window = p->ravg.demand;
- rq->prev_runnable_sum += p->ravg.demand;
+ *prev_runnable_sum += p->ravg.demand;
if (new_task)
- rq->nt_prev_runnable_sum += p->ravg.demand;
+ *nt_prev_runnable_sum += p->ravg.demand;
}
return;
@@ -2373,9 +2572,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
else
delta = irqtime;
delta = scale_exec_time(delta, rq, cc);
- rq->curr_runnable_sum += delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum += delta;
+ *nt_curr_runnable_sum += delta;
+
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.curr_window += delta;
@@ -2409,15 +2609,17 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (!exiting_task(p))
p->ravg.prev_window = delta;
}
- rq->prev_runnable_sum += delta;
+
+ *prev_runnable_sum += delta;
if (new_task)
- rq->nt_prev_runnable_sum += delta;
+ *nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq, cc);
- rq->curr_runnable_sum += delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum += delta;
+ *nt_curr_runnable_sum += delta;
+
if (!exiting_task(p))
p->ravg.curr_window = delta;
@@ -2444,12 +2646,6 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
cc);
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.prev_window += delta;
-
- rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
- if (new_task)
- rq->nt_prev_runnable_sum += delta;
-
- delta += rq->curr_runnable_sum;
} else {
/* Since at least one full window has elapsed,
* the contribution to the previous window is the
@@ -2457,27 +2653,20 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
delta = scale_exec_time(window_size, rq, cc);
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.prev_window = delta;
-
- if (new_task)
- rq->nt_prev_runnable_sum = delta;
- else
- rq->nt_prev_runnable_sum = 0;
}
- /*
- * Rollover for normal runnable sum is done here by overwriting
- * the values in prev_runnable_sum and curr_runnable_sum.
- * Rollover for new task runnable sum has completed by previous
- * if-else statement.
- */
- rq->prev_runnable_sum = delta;
+
+ /* The prev/curr_runnable_sum rollover has already been handled above
+ * (via flip_counters); here only this task's contribution is added. */
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq, cc);
- rq->curr_runnable_sum = delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum = delta;
- else
- rq->nt_curr_runnable_sum = 0;
+ *nt_curr_runnable_sum += delta;
+
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.curr_window = delta;
@@ -2500,12 +2689,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
/* Roll window over. If IRQ busy time was just in the current
* window then that is all that need be accounted. */
- rq->prev_runnable_sum = rq->curr_runnable_sum;
- rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
- rq->nt_curr_runnable_sum = 0;
if (mark_start > window_start) {
- rq->curr_runnable_sum = scale_exec_time(irqtime, rq,
- cc);
+ *curr_runnable_sum = scale_exec_time(irqtime, rq, cc);
return;
}
@@ -2515,7 +2700,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (delta > window_size)
delta = window_size;
delta = scale_exec_time(delta, rq, cc);
- rq->prev_runnable_sum += delta;
+ *prev_runnable_sum += delta;
/* Process the remaining IRQ busy time in the current window. */
delta = wallclock - window_start;
@@ -2820,7 +3005,8 @@ update_task_ravg(struct task_struct *p, struct rq *rq, int event,
update_task_pred_demand(rq, p, event);
done:
trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
- cc.cycles, cc.time);
+ cc.cycles, cc.time,
+ _group_cpu_time(p->grp, cpu_of(rq)));
p->ravg.mark_start = wallclock;
@@ -3002,7 +3188,8 @@ enum reset_reason_code {
ACCOUNT_WAIT_TIME_CHANGE,
HIST_SIZE_CHANGE,
MIGRATION_FIXUP_CHANGE,
- FREQ_ACCOUNT_WAIT_TIME_CHANGE
+ FREQ_ACCOUNT_WAIT_TIME_CHANGE,
+ FREQ_AGGREGATE_CHANGE,
};
const char *sched_window_reset_reasons[] = {
@@ -3021,6 +3208,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
u64 start_ts = sched_ktime_clock();
int reason = WINDOW_CHANGE;
unsigned int old = 0, new = 0;
+ struct related_thread_group *grp;
disable_window_stats();
@@ -3028,11 +3216,26 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
local_irq_save(flags);
+ read_lock(&related_thread_group_lock);
+
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
raw_spin_lock(&rq->lock);
}
+ list_for_each_entry(grp, &related_thread_groups, list) {
+ int j;
+
+ for_each_possible_cpu(j) {
+ struct group_cpu_time *cpu_time;
+ /* Protected by rq lock */
+ cpu_time = _group_cpu_time(grp, j);
+ memset(cpu_time, 0, sizeof(struct group_cpu_time));
+ if (window_start)
+ cpu_time->window_start = window_start;
+ }
+ }
+
if (window_size) {
sched_ravg_window = window_size * TICK_NSEC;
set_hmp_defaults();
@@ -3081,6 +3284,12 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
new = sysctl_sched_freq_account_wait_time;
sched_freq_account_wait_time =
sysctl_sched_freq_account_wait_time;
+ } else if (sched_freq_aggregate !=
+ sysctl_sched_freq_aggregate) {
+ reason = FREQ_AGGREGATE_CHANGE;
+ old = sched_freq_aggregate;
+ new = sysctl_sched_freq_aggregate;
+ sched_freq_aggregate = sysctl_sched_freq_aggregate;
}
#endif
@@ -3089,6 +3298,8 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
raw_spin_unlock(&rq->lock);
}
+ read_unlock(&related_thread_group_lock);
+
local_irq_restore(flags);
trace_sched_reset_all_window_stats(window_start, window_size,
@@ -3097,13 +3308,17 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
#ifdef CONFIG_SCHED_FREQ_INPUT
+static inline void
+sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
+
void sched_get_cpus_busy(struct sched_load *busy,
const struct cpumask *query_cpus)
{
unsigned long flags;
struct rq *rq;
const int cpus = cpumask_weight(query_cpus);
- u64 load[cpus], nload[cpus];
+ u64 load[cpus], group_load[cpus];
+ u64 nload[cpus], ngload[cpus];
u64 pload[cpus];
unsigned int cur_freq[cpus], max_freq[cpus];
int notifier_sent[cpus];
@@ -3111,6 +3326,9 @@ void sched_get_cpus_busy(struct sched_load *busy,
int cpu, i = 0;
unsigned int window_size;
struct cpu_cycle cc;
+ u64 max_prev_sum = 0;
+ int max_busy_cpu = cpumask_first(query_cpus);
+ struct related_thread_group *grp;
if (unlikely(cpus == 0))
return;
@@ -3120,6 +3338,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
* current task may have been executing for a long time. Ensure
* that the window stats are current by doing an update.
*/
+ read_lock(&related_thread_group_lock);
+
local_irq_save(flags);
for_each_cpu(cpu, query_cpus)
raw_spin_lock(&cpu_rq(cpu)->lock);
@@ -3137,6 +3357,49 @@ void sched_get_cpus_busy(struct sched_load *busy,
nload[i] = rq->nt_prev_runnable_sum;
pload[i] = rq->hmp_stats.pred_demands_sum;
rq->old_estimated_time = pload[i];
+
+ if (load[i] > max_prev_sum) {
+ max_prev_sum = load[i];
+ max_busy_cpu = cpu;
+ }
+
+ notifier_sent[i] = rq->notifier_sent;
+ early_detection[i] = (rq->ed_task != NULL);
+ rq->notifier_sent = 0;
+ cur_freq[i] = cpu_cur_freq(cpu);
+ max_freq[i] = cpu_max_freq(cpu);
+ i++;
+ }
+
+ for_each_related_thread_group(grp) {
+ for_each_cpu(cpu, query_cpus) {
+ /* Protected by rq_lock */
+ struct group_cpu_time *cpu_time =
+ _group_cpu_time(grp, cpu);
+ sync_window_start(cpu_rq(cpu), cpu_time);
+ }
+ }
+
+ i = 0;
+ for_each_cpu(cpu, query_cpus) {
+ group_load[i] = 0;
+ ngload[i] = 0;
+
+ if (early_detection[i])
+ goto skip_early;
+
+ rq = cpu_rq(cpu);
+ if (!notifier_sent[i]) {
+ if (cpu == max_busy_cpu)
+ group_load_in_freq_domain(
+ &rq->freq_domain_cpumask,
+ &group_load[i], &ngload[i]);
+ } else {
+ _group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
+ }
+
+ load[i] += group_load[i];
+ nload[i] += ngload[i];
/*
* Scale load in reference to cluster max_possible_freq.
*
@@ -3146,11 +3409,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
load[i] = scale_load_to_cpu(load[i], cpu);
nload[i] = scale_load_to_cpu(nload[i], cpu);
pload[i] = scale_load_to_cpu(pload[i], cpu);
-
- notifier_sent[i] = rq->notifier_sent;
- early_detection[i] = (rq->ed_task != NULL);
- rq->notifier_sent = 0;
- max_freq[i] = cpu_max_freq(cpu);
+skip_early:
i++;
}
@@ -3158,6 +3417,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
raw_spin_unlock(&(cpu_rq(cpu))->lock);
local_irq_restore(flags);
+ read_unlock(&related_thread_group_lock);
+
i = 0;
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
@@ -3205,17 +3466,6 @@ exit_early:
}
}
-unsigned long sched_get_busy(int cpu)
-{
- struct cpumask query_cpu = CPU_MASK_NONE;
- struct sched_load busy;
-
- cpumask_set_cpu(cpu, &query_cpu);
- sched_get_cpus_busy(&busy, &query_cpu);
-
- return busy.prev_load;
-}
-
void sched_set_io_is_busy(int val)
{
sched_io_is_busy = val;
@@ -3267,7 +3517,14 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
struct rq *src_rq = task_rq(p);
struct rq *dest_rq = cpu_rq(new_cpu);
u64 wallclock;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ int migrate_type;
+ struct migration_sum_data d;
bool new_task;
+ struct related_thread_group *grp;
if (!sched_enable_hmp || !sched_migration_fixup ||
(!p->on_rq && p->state != TASK_WAKING))
@@ -3298,22 +3555,62 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
update_task_cpu_cycles(p, new_cpu);
new_task = is_new_task(p);
+ /* Protected by rq_lock */
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time;
+
+ migrate_type = GROUP_TO_GROUP;
+ /* Protected by rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
+ d.src_rq = NULL;
+ d.src_cpu_time = cpu_time;
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ /* Protected by rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
+ d.dst_rq = NULL;
+ d.dst_cpu_time = cpu_time;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ sync_window_start(dest_rq, cpu_time);
+ } else {
+ migrate_type = RQ_TO_RQ;
+ d.src_rq = src_rq;
+ d.src_cpu_time = NULL;
+ d.dst_rq = dest_rq;
+ d.dst_cpu_time = NULL;
+ src_curr_runnable_sum = &src_rq->curr_runnable_sum;
+ src_prev_runnable_sum = &src_rq->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
+
+ dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
+ dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
+ }
if (p->ravg.curr_window) {
- src_rq->curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
if (new_task) {
- src_rq->nt_curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
}
}
if (p->ravg.prev_window) {
- src_rq->prev_runnable_sum -= p->ravg.prev_window;
- dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
if (new_task) {
- src_rq->nt_prev_runnable_sum -= p->ravg.prev_window;
- dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
}
}
@@ -3323,13 +3620,11 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
dest_rq->ed_task = p;
}
- BUG_ON((s64)src_rq->prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->curr_runnable_sum < 0);
- BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
-
- trace_sched_migration_update_sum(src_rq, p);
- trace_sched_migration_update_sum(dest_rq, p);
+ trace_sched_migration_update_sum(p, migrate_type, &d);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
done:
if (p->state == TASK_WAKING)
@@ -3368,10 +3663,6 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
update_up_down_migrate();
}
-static LIST_HEAD(related_thread_groups);
-static DEFINE_RWLOCK(related_thread_group_lock);
-static int nr_related_thread_groups;
-
/* Return cluster which can offer required capacity for group */
static struct sched_cluster *
best_cluster(struct related_thread_group *grp, u64 total_demand)
@@ -3421,6 +3712,199 @@ static void set_preferred_cluster(struct related_thread_group *grp)
raw_spin_unlock(&grp->lock);
}
+#define ADD_TASK 0
+#define REM_TASK 1
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+static struct cpu_cycle
+update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime);
+
+static inline void free_group_cputime(struct related_thread_group *grp)
+{
+ free_percpu(grp->cpu_time);
+}
+
+static int alloc_group_cputime(struct related_thread_group *grp)
+{
+ int i;
+ struct group_cpu_time *cpu_time;
+ int cpu = raw_smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ u64 window_start = rq->window_start;
+
+ grp->cpu_time = alloc_percpu(struct group_cpu_time);
+ if (!grp->cpu_time)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ cpu_time = per_cpu_ptr(grp->cpu_time, i);
+ memset(cpu_time, 0, sizeof(struct group_cpu_time));
+ cpu_time->window_start = window_start;
+ }
+
+ return 0;
+}
+
+/*
+ * A group's window_start may be behind. When moving it forward, flip prev/curr
+ * counters. When moving forward by more than one window, the prev counter is set to 0.
+ */
+static inline void
+sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time)
+{
+ u64 delta;
+ int nr_windows;
+ u64 curr_sum = cpu_time->curr_runnable_sum;
+ u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum;
+
+ delta = rq->window_start - cpu_time->window_start;
+ if (!delta)
+ return;
+
+ nr_windows = div64_u64(delta, sched_ravg_window);
+ if (nr_windows > 1)
+ curr_sum = nt_curr_sum = 0;
+
+ cpu_time->prev_runnable_sum = curr_sum;
+ cpu_time->curr_runnable_sum = 0;
+
+ cpu_time->nt_prev_runnable_sum = nt_curr_sum;
+ cpu_time->nt_curr_runnable_sum = 0;
+
+ cpu_time->window_start = rq->window_start;
+}
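
As a quick illustration of the rollover rule implemented above, the following user-space model (not kernel code: the struct is reduced to the three fields used here, the nt_* counters are omitted, and the window length is an arbitrary 20 units) shows the two cases:

#include <assert.h>

struct gct { unsigned long long ws, curr, prev; };	/* reduced group_cpu_time */

/* mirror of the rule above: flip on a one-window lag, zero prev on a larger lag */
static void sync_ws(struct gct *t, unsigned long long rq_ws,
		    unsigned long long window)
{
	unsigned long long delta = rq_ws - t->ws;

	if (!delta)
		return;
	t->prev = (delta / window > 1) ? 0 : t->curr;
	t->curr = 0;
	t->ws = rq_ws;
}

int main(void)
{
	struct gct a = { .ws = 100, .curr = 7, .prev = 3 };

	sync_ws(&a, 120, 20);	/* exactly one window behind: prev takes old curr */
	assert(a.prev == 7 && a.curr == 0 && a.ws == 120);

	sync_ws(&a, 180, 20);	/* three windows behind: prev is zeroed */
	assert(a.prev == 0 && a.curr == 0 && a.ws == 180);
	return 0;
}
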
+
+/*
+ * Task's cpu usage is accounted in:
+ * rq->curr/prev_runnable_sum, when its ->grp is NULL
+ * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
+ *
+ * Transfer task's cpu usage between those counters when transitioning between
+ * groups
+ */
+static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
+ struct task_struct *p, int event)
+{
+ u64 wallclock;
+ struct group_cpu_time *cpu_time;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ struct migration_sum_data d;
+ int migrate_type;
+
+ if (!sched_freq_aggregate)
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+
+ /* cpu_time protected by related_thread_group_lock, grp->lock and rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(rq));
+ if (event == ADD_TASK) {
+ sync_window_start(rq, cpu_time);
+ migrate_type = RQ_TO_GROUP;
+ d.src_rq = rq;
+ d.src_cpu_time = NULL;
+ d.dst_rq = NULL;
+ d.dst_cpu_time = cpu_time;
+ src_curr_runnable_sum = &rq->curr_runnable_sum;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &rq->prev_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ } else if (event == REM_TASK) {
+ migrate_type = GROUP_TO_RQ;
+ d.src_rq = NULL;
+ d.src_cpu_time = cpu_time;
+ d.dst_rq = rq;
+ d.dst_cpu_time = NULL;
+
+ /*
+ * In case of REM_TASK, cpu_time->window_start would be
+ * up to date, because of the update_task_ravg() we called
+ * above on the moving task. Hence there is no need for
+ * sync_window_start().
+ */
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_curr_runnable_sum = &rq->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_prev_runnable_sum = &rq->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ }
+
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+
+ if (is_new_task(p)) {
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ trace_sched_migration_update_sum(p, migrate_type, &d);
+
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+}
+
+static inline struct group_cpu_time *
+task_group_cpu_time(struct task_struct *p, int cpu)
+{
+ return _group_cpu_time(rcu_dereference(p->grp), cpu);
+}
+
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu)
+{
+ return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
+}
+
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+static inline void free_group_cputime(struct related_thread_group *grp) { }
+
+static inline int alloc_group_cputime(struct related_thread_group *grp)
+{
+ return 0;
+}
+
+static inline void transfer_busy_time(struct rq *rq,
+ struct related_thread_group *grp, struct task_struct *p, int event)
+{
+}
+
+static struct group_cpu_time *
+task_group_cpu_time(struct task_struct *p, int cpu)
+{
+ return NULL;
+}
+
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu)
+{
+ return NULL;
+}
+
+#endif
+
struct related_thread_group *alloc_related_thread_group(int group_id)
{
struct related_thread_group *grp;
@@ -3429,6 +3913,11 @@ struct related_thread_group *alloc_related_thread_group(int group_id)
if (!grp)
return ERR_PTR(-ENOMEM);
+ if (alloc_group_cputime(grp)) {
+ kfree(grp);
+ return ERR_PTR(-ENOMEM);
+ }
+
grp->id = group_id;
INIT_LIST_HEAD(&grp->tasks);
INIT_LIST_HEAD(&grp->list);
@@ -3449,6 +3938,16 @@ struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
return NULL;
}
+/* See comments before preferred_cluster() */
+static void free_related_thread_group(struct rcu_head *rcu)
+{
+ struct related_thread_group *grp = container_of(rcu, struct
+ related_thread_group, rcu);
+
+ free_group_cputime(grp);
+ kfree(grp);
+}
+
static void remove_task_from_group(struct task_struct *p)
{
struct related_thread_group *grp = p->grp;
@@ -3458,6 +3957,7 @@ static void remove_task_from_group(struct task_struct *p)
raw_spin_lock(&grp->lock);
rq = __task_rq_lock(p);
+ transfer_busy_time(rq, p->grp, p, REM_TASK);
list_del_init(&p->grp_list);
rcu_assign_pointer(p->grp, NULL);
__task_rq_unlock(rq);
@@ -3471,9 +3971,7 @@ static void remove_task_from_group(struct task_struct *p)
if (empty_group) {
list_del(&grp->list);
- nr_related_thread_groups--;
- /* See comments before preferred_cluster() */
- kfree_rcu(grp, rcu);
+ call_rcu(&grp->rcu, free_related_thread_group);
}
}
@@ -3489,8 +3987,9 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
* reference of p->grp in various hot-paths
*/
rq = __task_rq_lock(p);
- rcu_assign_pointer(p->grp, grp);
+ transfer_busy_time(rq, grp, p, ADD_TASK);
list_add(&p->grp_list, &grp->tasks);
+ rcu_assign_pointer(p->grp, grp);
__task_rq_unlock(rq);
_set_preferred_cluster(grp);
@@ -3539,7 +4038,6 @@ redo:
} else if (!grp && new) {
/* New group - use object allocated before */
destroy = 0;
- nr_related_thread_groups++;
list_add(&new->list, &related_thread_groups);
grp = new;
}
@@ -3550,8 +4048,10 @@ redo:
done:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- if (destroy)
+ if (new && destroy) {
+ free_group_cputime(new);
kfree(new);
+ }
return rc;
}
@@ -3898,13 +4398,19 @@ static void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
struct task_struct *p)
{
struct migration_notify_data mnd;
+ bool check_groups;
+
+ rcu_read_lock();
+ check_groups = rcu_access_pointer(p->grp) != NULL;
+ rcu_read_unlock();
if (!same_freq_domain(src_cpu, dest_cpu)) {
if (!src_cpu_dead)
- check_for_freq_change(cpu_rq(src_cpu), false);
- check_for_freq_change(cpu_rq(dest_cpu), false);
+ check_for_freq_change(cpu_rq(src_cpu), false,
+ check_groups);
+ check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
} else {
- check_for_freq_change(cpu_rq(dest_cpu), true);
+ check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
}
if (task_notify_on_migrate(p)) {
@@ -4771,6 +5277,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
struct related_thread_group *grp = NULL;
#endif
bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER);
+ bool check_group = false;
wake_flags &= ~WF_NO_NOTIFIER;
@@ -4846,6 +5353,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (update_preferred_cluster(grp, p, old_load))
set_preferred_cluster(grp);
rcu_read_unlock();
+ check_group = grp != NULL;
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -4894,12 +5402,14 @@ out:
if (freq_notif_allowed) {
if (!same_freq_domain(src_cpu, cpu)) {
- check_for_freq_change(cpu_rq(cpu), false);
- check_for_freq_change(cpu_rq(src_cpu), false);
+ check_for_freq_change(cpu_rq(cpu),
+ false, check_group);
+ check_for_freq_change(cpu_rq(src_cpu),
+ false, check_group);
} else if (heavy_task) {
- check_for_freq_change(cpu_rq(cpu), false);
+ check_for_freq_change(cpu_rq(cpu), false, false);
} else if (success) {
- check_for_freq_change(cpu_rq(cpu), true);
+ check_for_freq_change(cpu_rq(cpu), true, false);
}
}
@@ -10543,6 +11053,7 @@ void __init sched_init(void)
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
rq->old_busy_time = 0;
rq->old_estimated_time = 0;
+ rq->old_busy_time_group = 0;
rq->notifier_sent = 0;
rq->hmp_stats.pred_demands_sum = 0;
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0288a331e311..a33eddb7b17d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,9 +32,8 @@
#include <linux/task_work.h>
#include <linux/ratelimit.h>
-#include <trace/events/sched.h>
-
#include "sched.h"
+#include <trace/events/sched.h>
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -4059,6 +4058,9 @@ static inline int invalid_value_freq_input(unsigned int *data)
if (data == &sysctl_sched_freq_account_wait_time)
return !(*data == 0 || *data == 1);
+ if (data == &sysctl_sched_freq_aggregate)
+ return !(*data == 0 || *data == 1);
+
return 0;
}
#else
@@ -7674,6 +7676,7 @@ enum fbq_type { regular, remote, all };
LBF_BIG_TASK_ACTIVE_BALANCE)
#define LBF_IGNORE_BIG_TASKS 0x100
#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
struct lb_env {
struct sched_domain *sd;
@@ -7916,6 +7919,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
deactivate_task(env->src_rq, p, 0);
double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ if (rcu_access_pointer(p->grp))
+ env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
double_unlock_balance(env->src_rq, env->dst_rq);
}
@@ -9575,10 +9580,13 @@ no_move:
/* Assumes one 'busiest' cpu that we pulled tasks from */
if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
- check_for_freq_change(this_rq, false);
- check_for_freq_change(busiest, false);
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+ check_for_freq_change(this_rq, false, check_groups);
+ check_for_freq_change(busiest, false, check_groups);
} else {
- check_for_freq_change(this_rq, true);
+ check_for_freq_change(this_rq, true, false);
}
}
if (likely(!active_balance)) {
@@ -9876,10 +9884,12 @@ out_unlock:
local_irq_enable();
if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
- check_for_freq_change(busiest_rq, false);
- check_for_freq_change(target_rq, false);
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+ check_for_freq_change(busiest_rq, false, check_groups);
+ check_for_freq_change(target_rq, false, check_groups);
} else if (moved) {
- check_for_freq_change(target_rq, true);
+ check_for_freq_change(target_rq, true, false);
}
if (per_cpu(dbs_boost_needed, target_cpu)) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a66d8a12051c..df9b972195e5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,16 @@ struct related_thread_group {
struct sched_cluster *preferred_cluster;
struct rcu_head rcu;
u64 last_update;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ struct group_cpu_time __percpu *cpu_time; /* one per cluster */
+#endif
+};
+
+struct migration_sum_data {
+ struct rq *src_rq, *dst_rq;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ struct group_cpu_time *src_cpu_time, *dst_cpu_time;
+#endif
};
extern struct list_head cluster_head;
@@ -741,7 +751,7 @@ struct rq {
struct task_struct *ed_task;
#ifdef CONFIG_SCHED_FREQ_INPUT
- unsigned int old_busy_time;
+ u64 old_busy_time, old_busy_time_group;
int notifier_sent;
u64 old_estimated_time;
#endif
@@ -1337,7 +1347,16 @@ static inline int update_preferred_cluster(struct related_thread_group *grp,
#ifdef CONFIG_SCHED_FREQ_INPUT
#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand)
-extern void check_for_freq_change(struct rq *rq, bool check_cra);
+extern void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups);
+
+struct group_cpu_time {
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ u64 window_start;
+};
/* Is frequency of two cpus synchronized with each other? */
static inline int same_freq_domain(int src_cpu, int dst_cpu)
@@ -1355,7 +1374,8 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu)
#define sched_migration_fixup 0
#define PRED_DEMAND_DELTA (0)
-static inline void check_for_freq_change(struct rq *rq, bool check_cra) { }
+static inline void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { }
static inline int same_freq_domain(int src_cpu, int dst_cpu)
{
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
index cdb1d7c53849..c70e0466c36c 100644
--- a/kernel/sched/sched_avg.c
+++ b/kernel/sched/sched_avg.c
@@ -18,9 +18,9 @@
#include <linux/hrtimer.h>
#include <linux/sched.h>
#include <linux/math64.h>
-#include <trace/events/sched.h>
#include "sched.h"
+#include <trace/events/sched.h>
static DEFINE_PER_CPU(u64, nr_prod_sum);
static DEFINE_PER_CPU(u64, last_time);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1da3b96368b1..825be75ca1a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -472,6 +472,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
+ {
+ .procname = "sched_freq_aggregate",
+ .data = &sysctl_sched_freq_aggregate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
#endif
{
.procname = "sched_boost",