Diffstat (limited to 'kernel')
-rw-r--r--  kernel/auditsc.c          332
-rw-r--r--  kernel/cgroup.c             2
-rw-r--r--  kernel/power/qos.c         32
-rw-r--r--  kernel/sched/core.c        15
-rw-r--r--  kernel/sched/core_ctl.c    10
-rw-r--r--  kernel/sched/hmp.c        218
-rw-r--r--  kernel/sched/sched.h        3
7 files changed, 317 insertions(+), 295 deletions(-)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc04959de..48f45987dc6c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
#include <linux/compat.h>
#include <linux/ctype.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <uapi/linux/limits.h>
#include "audit.h"
@@ -82,7 +83,8 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
#define MAX_EXECVE_AUDIT_LEN 7500
/* max length to print of cmdline/proctitle value during audit */
@@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
return rc;
}
-/*
- * to_send and len_sent accounting are very loose estimates. We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message. In one 7500 byte message we can log up to
- * about 1000 min size arguments. That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
- struct audit_buffer **ab,
- int arg_num,
- size_t *len_sent,
- const char __user *p,
- char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+ struct audit_buffer **ab)
{
- char arg_num_len_buf[12];
- const char __user *tmp_p = p;
- /* how many digits are in arg_num? 5 is the length of ' a=""' */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
- size_t len, len_left, to_send;
- size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
- unsigned int i, has_cntl = 0, too_long = 0;
- int ret;
-
- /* strnlen_user includes the null we don't want to send */
- len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
- /*
- * We just created this mm, if we can't find the strings
- * we just copied into it something is _very_ wrong. Similar
- * for strings that are too long, we should not have created
- * any.
- */
- if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
- send_sig(SIGKILL, current, 0);
- return -1;
+ long len_max;
+ long len_rem;
+ long len_full;
+ long len_buf;
+ long len_abuf;
+ long len_tmp;
+ bool require_data;
+ bool encode;
+ unsigned int iter;
+ unsigned int arg;
+ char *buf_head;
+ char *buf;
+ const char __user *p = (const char __user *)current->mm->arg_start;
+
+ /* NOTE: this buffer needs to be large enough to hold all the non-arg
+ * data we put in the audit record for this argument (see the
+ * code below) ... at this point in time 96 is plenty */
+ char abuf[96];
+
+ /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+ * current value of 7500 is not as important as the fact that it
+ * is less than 8k, a setting of 7500 gives us plenty of wiggle
+ * room if we go over a little bit in the logging below */
+ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+ len_max = MAX_EXECVE_AUDIT_LEN;
+
+ /* scratch buffer to hold the userspace args */
+ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+ if (!buf_head) {
+ audit_panic("out of memory for argv string");
+ return;
}
+ buf = buf_head;
- /* walk the whole argument looking for non-ascii chars */
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+ len_rem = len_max;
+ len_buf = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ iter = 0;
+ arg = 0;
do {
- if (len_left > MAX_EXECVE_AUDIT_LEN)
- to_send = MAX_EXECVE_AUDIT_LEN;
- else
- to_send = len_left;
- ret = copy_from_user(buf, tmp_p, to_send);
- /*
- * There is no reason for this copy to be short. We just
- * copied them here, and the mm hasn't been exposed to user-
- * space yet.
- */
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
- }
- buf[to_send] = '\0';
- has_cntl = audit_string_contains_control(buf, to_send);
- if (has_cntl) {
- /*
- * hex messages get logged as 2 bytes, so we can only
- * send half as much in each message
- */
- max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
- break;
- }
- len_left -= to_send;
- tmp_p += to_send;
- } while (len_left > 0);
-
- len_left = len;
-
- if (len > max_execve_audit_len)
- too_long = 1;
-
- /* rewalk the argument actually logging the message */
- for (i = 0; len_left > 0; i++) {
- int room_left;
-
- if (len_left > max_execve_audit_len)
- to_send = max_execve_audit_len;
- else
- to_send = len_left;
-
- /* do we have space left to send this argument in this ab? */
- room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
- if (has_cntl)
- room_left -= (to_send * 2);
- else
- room_left -= to_send;
- if (room_left < 0) {
- *len_sent = 0;
- audit_log_end(*ab);
- *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
- if (!*ab)
- return 0;
- }
+ /* NOTE: we don't ever want to trust this value for anything
+ * serious, but the audit record format insists we
+ * provide an argument length for really long arguments,
+ * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+ * to use strnlen_user() to obtain this value for
+ * recording in the log, although we don't use it
+ * anywhere here to avoid a double-fetch problem */
+ if (len_full == 0)
+ len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+ /* read more data from userspace */
+ if (require_data) {
+ /* can we make more room in the buffer? */
+ if (buf != buf_head) {
+ memmove(buf_head, buf, len_buf);
+ buf = buf_head;
+ }
+
+ /* fetch as much as we can of the argument */
+ len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+ len_max - len_buf);
+ if (len_tmp == -EFAULT) {
+ /* unable to copy from userspace */
+ send_sig(SIGKILL, current, 0);
+ goto out;
+ } else if (len_tmp == (len_max - len_buf)) {
+ /* buffer is not large enough */
+ require_data = true;
+ /* NOTE: if we are going to span multiple
+ * buffers force the encoding so we stand
+ * a chance at a sane len_full value and
+ * consistent record encoding */
+ encode = true;
+ len_full = len_full * 2;
+ p += len_tmp;
+ } else {
+ require_data = false;
+ if (!encode)
+ encode = audit_string_contains_control(
+ buf, len_tmp);
+ /* try to use a trusted value for len_full */
+ if (len_full < len_max)
+ len_full = (encode ?
+ len_tmp * 2 : len_tmp);
+ p += len_tmp + 1;
+ }
+ len_buf += len_tmp;
+ buf_head[len_buf] = '\0';
- /*
- * first record needs to say how long the original string was
- * so we can be sure nothing was lost.
- */
- if ((i == 0) && (too_long))
- audit_log_format(*ab, " a%d_len=%zu", arg_num,
- has_cntl ? 2*len : len);
-
- /*
- * normally arguments are small enough to fit and we already
- * filled buf above when we checked for control characters
- * so don't bother with another copy_from_user
- */
- if (len >= max_execve_audit_len)
- ret = copy_from_user(buf, p, to_send);
- else
- ret = 0;
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
+ /* length of the buffer in the audit record? */
+ len_abuf = (encode ? len_buf * 2 : len_buf + 2);
}
- buf[to_send] = '\0';
-
- /* actually log it */
- audit_log_format(*ab, " a%d", arg_num);
- if (too_long)
- audit_log_format(*ab, "[%d]", i);
- audit_log_format(*ab, "=");
- if (has_cntl)
- audit_log_n_hex(*ab, buf, to_send);
- else
- audit_log_string(*ab, buf);
-
- p += to_send;
- len_left -= to_send;
- *len_sent += arg_num_len;
- if (has_cntl)
- *len_sent += to_send * 2;
- else
- *len_sent += to_send;
- }
- /* include the null we didn't log */
- return len + 1;
-}
-static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab)
-{
- int i, len;
- size_t len_sent = 0;
- const char __user *p;
- char *buf;
+ /* write as much as we can to the audit log */
+ if (len_buf > 0) {
+ /* NOTE: some magic numbers here - basically if we
+ * can't fit a reasonable amount of data into the
+ * existing audit buffer, flush it and start with
+ * a new buffer */
+ if ((sizeof(abuf) + 8) > len_rem) {
+ len_rem = len_max;
+ audit_log_end(*ab);
+ *ab = audit_log_start(context,
+ GFP_KERNEL, AUDIT_EXECVE);
+ if (!*ab)
+ goto out;
+ }
- p = (const char __user *)current->mm->arg_start;
+ /* create the non-arg portion of the arg record */
+ len_tmp = 0;
+ if (require_data || (iter > 0) ||
+ ((len_abuf + sizeof(abuf)) > len_rem)) {
+ if (iter == 0) {
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d_len=%lu",
+ arg, len_full);
+ }
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d[%d]=", arg, iter++);
+ } else
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d=", arg);
+ WARN_ON(len_tmp >= sizeof(abuf));
+ abuf[sizeof(abuf) - 1] = '\0';
+
+ /* log the arg in the audit record */
+ audit_log_format(*ab, "%s", abuf);
+ len_rem -= len_tmp;
+ len_tmp = len_buf;
+ if (encode) {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem / 2; /* encoding */
+ audit_log_n_hex(*ab, buf, len_tmp);
+ len_rem -= len_tmp * 2;
+ len_abuf -= len_tmp * 2;
+ } else {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem - 2; /* quotes */
+ audit_log_n_string(*ab, buf, len_tmp);
+ len_rem -= len_tmp + 2;
+ /* don't subtract the "2" because we still need
+ * to add quotes to the remaining string */
+ len_abuf -= len_tmp;
+ }
+ len_buf -= len_tmp;
+ buf += len_tmp;
+ }
- audit_log_format(*ab, "argc=%d", context->execve.argc);
+ /* ready to move to the next argument? */
+ if ((len_buf == 0) && !require_data) {
+ arg++;
+ iter = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ }
+ } while (arg < context->execve.argc);
- /*
- * we need some kernel buffer to hold the userspace args. Just
- * allocate one big one rather than allocating one of the right size
- * for every single argument inside audit_log_single_execve_arg()
- * should be <8k allocation so should be pretty safe.
- */
- buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
- if (!buf) {
- audit_panic("out of memory for argv string");
- return;
- }
+ /* NOTE: the caller handles the final audit_log_end() call */
- for (i = 0; i < context->execve.argc; i++) {
- len = audit_log_single_execve_arg(context, ab, i,
- &len_sent, p, buf);
- if (len <= 0)
- break;
- p += len;
- }
- kfree(buf);
+out:
+ kfree(buf_head);
}
static void show_special(struct audit_context *context, int *call_panic)
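
The rewrite above replaces the two-pass scheme of audit_log_single_execve_arg()
(one userspace walk to detect control characters, then a second copy_from_user()
to log) with a single strncpy_from_user() fetch per chunk, closing the
double-fetch window where userspace could change an argument between the check
and the copy. A minimal userspace sketch of the same single-fetch pattern
(illustrative names, not kernel API):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    static int contains_control(const char *s, size_t len)
    {
            size_t i;

            for (i = 0; i < len; i++)
                    if (iscntrl((unsigned char)s[i]) || s[i] == '"')
                            return 1;
            return 0;
    }

    /* copy the untrusted data exactly once; every later decision
     * (encoding, length) is made on that same copy */
    static void log_arg_once(const char *untrusted, char *buf, size_t bufsz)
    {
            size_t i, len = strnlen(untrusted, bufsz - 1);

            memcpy(buf, untrusted, len);
            buf[len] = '\0';

            if (contains_control(buf, len)) {
                    for (i = 0; i < len; i++)
                            printf("%02X", (unsigned char)buf[i]);
                    putchar('\n');
            } else {
                    printf("\"%s\"\n", buf);
            }
    }

    int main(void)
    {
            char buf[64];

            log_arg_once("hello world", buf, sizeof(buf));
            log_arg_once("tab\there", buf, sizeof(buf));
            return 0;
    }
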
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ae83d9602aa0..8c9823947c7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6002,7 +6002,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct task_struct *task;
int count = 0;
- seq_printf(seq, "css_set %p\n", cset);
+ seq_printf(seq, "css_set %pK\n", cset);
list_for_each_entry(task, &cset->tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
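
The one-line cgroup.c change swaps %p for %pK so the css_set address in this
debug file honors kptr_restrict: unprivileged readers see a redacted value
instead of a raw kernel pointer. A userspace analogue of the same
"redact pointers for the unprivileged" idea (hypothetical helper, not
kernel API):

    #include <stdio.h>
    #include <unistd.h>

    /* print the real address only when privileged, mirroring what
     * %pK does under kptr_restrict */
    static void debug_print_ptr(const char *tag, const void *p)
    {
            if (geteuid() == 0)
                    printf("%s %p\n", tag, p);
            else
                    printf("%s 0x0\n", tag);
    }

    int main(void)
    {
            int obj;

            debug_print_ptr("css_set", &obj);
            return 0;
    }
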
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 69c32c42080f..582b66e882ce 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -358,7 +358,11 @@ int pm_qos_update_target(struct pm_qos_constraints *c,
spin_unlock_irqrestore(&pm_qos_lock, flags);
trace_pm_qos_update_target(action, prev_value, curr_value);
- if (prev_value != curr_value) {
+ /*
+ * If any CPU mask bits are set, call the notifier chain
+ * to apply the new QoS restriction to those cores.
+ */
+ if (!cpumask_empty(&cpus)) {
ret = 1;
if (c->notifiers)
blocking_notifier_call_chain(c->notifiers,
@@ -592,7 +596,6 @@ void pm_qos_add_request(struct pm_qos_request *req,
#ifdef CONFIG_SMP
case PM_QOS_REQ_AFFINE_IRQ:
if (irq_can_set_affinity(req->irq)) {
- int ret = 0;
struct irq_desc *desc = irq_to_desc(req->irq);
struct cpumask *mask = desc->irq_data.common->affinity;
@@ -602,13 +605,6 @@ void pm_qos_add_request(struct pm_qos_request *req,
req->irq_notify.notify = pm_qos_irq_notify;
req->irq_notify.release = pm_qos_irq_release;
- ret = irq_set_affinity_notifier(req->irq,
- &req->irq_notify);
- if (ret) {
- WARN(1, KERN_ERR "IRQ affinity notify set failed\n");
- req->type = PM_QOS_REQ_ALL_CORES;
- cpumask_setall(&req->cpus_affine);
- }
} else {
req->type = PM_QOS_REQ_ALL_CORES;
cpumask_setall(&req->cpus_affine);
@@ -630,6 +626,24 @@ void pm_qos_add_request(struct pm_qos_request *req,
trace_pm_qos_add_request(pm_qos_class, value);
pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
req, PM_QOS_ADD_REQ, value);
+
+#ifdef CONFIG_SMP
+ if (req->type == PM_QOS_REQ_AFFINE_IRQ &&
+ irq_can_set_affinity(req->irq)) {
+ int ret = 0;
+
+ ret = irq_set_affinity_notifier(req->irq,
+ &req->irq_notify);
+ if (ret) {
+ WARN(1, "IRQ affinity notify set failed\n");
+ req->type = PM_QOS_REQ_ALL_CORES;
+ cpumask_setall(&req->cpus_affine);
+ pm_qos_update_target(
+ pm_qos_array[pm_qos_class]->constraints,
+ req, PM_QOS_UPDATE_REQ, value);
+ }
+ }
+#endif
}
EXPORT_SYMBOL_GPL(pm_qos_add_request);
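
Two things happen in the qos.c hunks: pm_qos_update_target() now fires the
notifier chain whenever the affected CPU mask is non-empty rather than only
when the aggregate value changes, and irq_set_affinity_notifier() registration
moves to the end of pm_qos_add_request(), after the request is fully
initialized and the initial target update has run, falling back to
PM_QOS_REQ_ALL_CORES (plus a corrective update) if registration fails. A small
sketch of that "publish the callback last" ordering (illustrative names):

    #include <stdio.h>

    struct request {
            int value;
            void (*notify)(struct request *);
    };

    static struct request *registered;

    /* once registered, the callback may fire at any time, so the
     * object must already be fully initialized */
    static void register_notifier(struct request *req)
    {
            registered = req;
    }

    static void on_change(struct request *req)
    {
            printf("notified, value=%d\n", req->value);
    }

    int main(void)
    {
            struct request req = { 0 };

            req.value = 100;            /* 1. initialize state  */
            req.notify = on_change;     /* 2. wire the callback */
            register_notifier(&req);    /* 3. publish last      */

            if (registered && registered->notify)
                    registered->notify(registered);
            return 0;
    }
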
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d101e8a5f2..d7846edd7a79 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5600,7 +5600,6 @@ int do_isolation_work_cpu_stop(void *data)
*/
nohz_balance_clear_nohz_mask(cpu);
- clear_hmp_request(cpu);
local_irq_enable();
return 0;
}
@@ -5682,7 +5681,7 @@ int sched_isolate_cpu(int cpu)
if (trace_sched_isolate_enabled())
start_time = sched_clock();
- lock_device_hotplug();
+ cpu_maps_update_begin();
cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
@@ -5725,13 +5724,14 @@ int sched_isolate_cpu(int cpu)
migrate_sync_cpu(cpu, cpumask_first(&avail_cpus));
stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+ clear_hmp_request(cpu);
calc_load_migrate(rq);
update_max_interval();
sched_update_group_capacities(cpu);
out:
- unlock_device_hotplug();
+ cpu_maps_update_done();
trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
start_time, 1);
return ret_code;
@@ -5752,8 +5752,6 @@ int sched_unisolate_cpu_unlocked(int cpu)
if (trace_sched_isolate_enabled())
start_time = sched_clock();
- lock_device_hotplug_assert();
-
if (!cpu_isolation_vote[cpu]) {
ret_code = -EINVAL;
goto out;
@@ -5792,9 +5790,9 @@ int sched_unisolate_cpu(int cpu)
{
int ret_code;
- lock_device_hotplug();
+ cpu_maps_update_begin();
ret_code = sched_unisolate_cpu_unlocked(cpu);
- unlock_device_hotplug();
+ cpu_maps_update_done();
return ret_code;
}
@@ -8073,6 +8071,9 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
+ i = alloc_related_thread_groups();
+ BUG_ON(i);
+
set_hmp_defaults();
set_load_weight(&init_task);
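
The core.c hunks switch CPU isolation from the device-hotplug lock to
cpu_maps_update_begin()/cpu_maps_update_done(), the same mutex CPU hotplug
itself takes, move clear_hmp_request() to after stop_cpus() so requests raised
while the stopper runs are also cleared, and allocate the related thread
groups once at boot in sched_init(). A minimal sketch of the begin/done
critical-section pairing (userspace analogue, illustrative names):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t maps_lock = PTHREAD_MUTEX_INITIALIZER;

    static void maps_update_begin(void) { pthread_mutex_lock(&maps_lock); }
    static void maps_update_done(void)  { pthread_mutex_unlock(&maps_lock); }

    /* every path that changes the CPU maps takes the same mutex,
     * so isolation and hotplug can never interleave */
    static int isolate_cpu(int cpu)
    {
            maps_update_begin();
            printf("isolating cpu %d with hotplug excluded\n", cpu);
            /* ... migrate tasks, clear pending requests ... */
            maps_update_done();
            return 0;
    }

    int main(void)
    {
            return isolate_cpu(1);
    }
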
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index 9b21a09ec4ba..aac12bfc2ae6 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -893,14 +893,10 @@ static int __ref cpu_callback(struct notifier_block *nfb,
unsigned int need;
int ret = NOTIFY_OK;
- /* Don't affect suspend resume */
- if (action & CPU_TASKS_FROZEN)
- return NOTIFY_OK;
-
if (unlikely(!cluster || !cluster->inited))
return NOTIFY_OK;
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
/* If online state of CPU somehow got out of sync, fix it. */
@@ -1095,7 +1091,7 @@ static int __init core_ctl_init(void)
cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
- lock_device_hotplug();
+ cpu_maps_update_begin();
for_each_online_cpu(cpu) {
struct cpufreq_policy *policy;
int ret;
@@ -1109,7 +1105,7 @@ static int __init core_ctl_init(void)
cpufreq_cpu_put(policy);
}
}
- unlock_device_hotplug();
+ cpu_maps_update_done();
initialized = true;
return 0;
}
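
In core_ctl.c the notifier no longer ignores suspend/resume transitions;
instead it masks CPU_TASKS_FROZEN off the action so frozen hotplug events take
the same paths as normal ones. A runnable sketch of that flag-masking idiom
(the constants mirror the kernel's notifier values, shown here for
illustration only):

    #include <stdio.h>

    #define CPU_UP_PREPARE   0x0003
    #define CPU_DEAD         0x0007
    #define CPU_TASKS_FROZEN 0x0010

    /* strip the modifier bit before the switch so FROZEN variants
     * of each event land in the same case */
    static void cpu_callback(unsigned long action)
    {
            int frozen = !!(action & CPU_TASKS_FROZEN);

            switch (action & ~CPU_TASKS_FROZEN) {
            case CPU_UP_PREPARE:
                    printf("up-prepare (frozen=%d)\n", frozen);
                    break;
            case CPU_DEAD:
                    printf("dead (frozen=%d)\n", frozen);
                    break;
            }
    }

    int main(void)
    {
            cpu_callback(CPU_UP_PREPARE);
            cpu_callback(CPU_UP_PREPARE | CPU_TASKS_FROZEN);
            return 0;
    }
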
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 968a41e0e81e..6304c5030137 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -641,14 +641,18 @@ void clear_hmp_request(int cpu)
clear_boost_kick(cpu);
clear_reserved(cpu);
if (rq->push_task) {
+ struct task_struct *push_task = NULL;
+
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->push_task) {
clear_reserved(rq->push_cpu);
- put_task_struct(rq->push_task);
+ push_task = rq->push_task;
rq->push_task = NULL;
}
rq->active_balance = 0;
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ if (push_task)
+ put_task_struct(push_task);
}
}
@@ -784,11 +788,12 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000;
static unsigned int sync_cpu;
-static LIST_HEAD(related_thread_groups);
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
static DEFINE_RWLOCK(related_thread_group_lock);
#define for_each_related_thread_group(grp) \
- list_for_each_entry(grp, &related_thread_groups, list)
+ list_for_each_entry(grp, &active_related_thread_groups, list)
/*
* Task load is categorized into buckets for the purpose of top task tracking.
@@ -1767,20 +1772,20 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups)
if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
return 0;
} else {
- read_lock(&related_thread_group_lock);
+ read_lock_irqsave(&related_thread_group_lock, flags);
/*
* Protect from concurrent update of rq->prev_runnable_sum and
* group cpu load
*/
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
if (check_groups)
_group_load_in_cpu(cpu_of(rq), &group_load, NULL);
new_load = rq->prev_runnable_sum + group_load;
new_load = freq_policy_load(rq, new_load);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- read_unlock(&related_thread_group_lock);
+ raw_spin_unlock(&rq->lock);
+ read_unlock_irqrestore(&related_thread_group_lock, flags);
cur_freq = load_to_freq(rq, rq->old_busy_time);
freq_required = load_to_freq(rq, new_load);
@@ -3052,7 +3057,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
read_unlock(&tasklist_lock);
- list_for_each_entry(grp, &related_thread_groups, list) {
+ list_for_each_entry(grp, &active_related_thread_groups, list) {
int j;
for_each_possible_cpu(j) {
@@ -3202,14 +3207,16 @@ void sched_get_cpus_busy(struct sched_load *busy,
if (unlikely(cpus == 0))
return;
+ local_irq_save(flags);
+
+ read_lock(&related_thread_group_lock);
+
/*
* This function could be called in timer context, and the
* current task may have been executing for a long time. Ensure
* that the window stats are current by doing an update.
*/
- read_lock(&related_thread_group_lock);
- local_irq_save(flags);
for_each_cpu(cpu, query_cpus)
raw_spin_lock(&cpu_rq(cpu)->lock);
@@ -3309,10 +3316,11 @@ skip_early:
for_each_cpu(cpu, query_cpus)
raw_spin_unlock(&(cpu_rq(cpu))->lock);
- local_irq_restore(flags);
read_unlock(&related_thread_group_lock);
+ local_irq_restore(flags);
+
i = 0;
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
@@ -3965,47 +3973,54 @@ _group_cpu_time(struct related_thread_group *grp, int cpu)
return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
}
-struct related_thread_group *alloc_related_thread_group(int group_id)
+static inline struct related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
{
- struct related_thread_group *grp;
-
- grp = kzalloc(sizeof(*grp), GFP_ATOMIC);
- if (!grp)
- return ERR_PTR(-ENOMEM);
-
- if (alloc_group_cputime(grp)) {
- kfree(grp);
- return ERR_PTR(-ENOMEM);
- }
-
- grp->id = group_id;
- INIT_LIST_HEAD(&grp->tasks);
- INIT_LIST_HEAD(&grp->list);
- raw_spin_lock_init(&grp->lock);
-
- return grp;
+ return related_thread_groups[group_id];
}
-struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
+int alloc_related_thread_groups(void)
{
+ int i, ret;
struct related_thread_group *grp;
- list_for_each_entry(grp, &related_thread_groups, list) {
- if (grp->id == group_id)
- return grp;
+ /* group_id = 0 is invalid as it's the special id used to remove a group. */
+ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+ grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+ if (!grp) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (alloc_group_cputime(grp)) {
+ kfree(grp);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ grp->id = i;
+ INIT_LIST_HEAD(&grp->tasks);
+ INIT_LIST_HEAD(&grp->list);
+ raw_spin_lock_init(&grp->lock);
+
+ related_thread_groups[i] = grp;
}
- return NULL;
-}
+ return 0;
-/* See comments before preferred_cluster() */
-static void free_related_thread_group(struct rcu_head *rcu)
-{
- struct related_thread_group *grp = container_of(rcu, struct
- related_thread_group, rcu);
+err:
+ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+ grp = lookup_related_thread_group(i);
+ if (grp) {
+ free_group_cputime(grp);
+ kfree(grp);
+ related_thread_groups[i] = NULL;
+ } else {
+ break;
+ }
+ }
- free_group_cputime(grp);
- kfree(grp);
+ return ret;
}
static void remove_task_from_group(struct task_struct *p)
@@ -4030,10 +4045,12 @@ static void remove_task_from_group(struct task_struct *p)
raw_spin_unlock(&grp->lock);
/* Reserved groups cannot be destroyed */
- if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) {
- list_del(&grp->list);
- call_rcu(&grp->rcu, free_related_thread_group);
- }
+ if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+ /*
+ * We test whether grp->list is attached with list_empty(),
+ * hence re-init the list after deletion.
+ */
+ list_del_init(&grp->list);
}
static int
@@ -4105,53 +4122,15 @@ void add_new_task_to_grp(struct task_struct *new)
write_unlock_irqrestore(&related_thread_group_lock, flags);
}
-#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
-/*
- * We create a default colocation group at boot. There is no need to
- * synchronize tasks between cgroups at creation time because the
- * correct cgroup hierarchy is not available at boot. Therefore cgroup
- * colocation is turned off by default even though the colocation group
- * itself has been allocated. Furthermore this colocation group cannot
- * be destroyted once it has been created. All of this has been as part
- * of runtime optimizations.
- *
- * The job of synchronizing tasks to the colocation group is done when
- * the colocation flag in the cgroup is turned on.
- */
-static int __init create_default_coloc_group(void)
-{
- struct related_thread_group *grp = NULL;
- unsigned long flags;
-
- grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
- if (IS_ERR(grp)) {
- WARN_ON(1);
- return -ENOMEM;
- }
-
- write_lock_irqsave(&related_thread_group_lock, flags);
- list_add(&grp->list, &related_thread_groups);
- write_unlock_irqrestore(&related_thread_group_lock, flags);
-
- update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
- return 0;
-}
-late_initcall(create_default_coloc_group);
-
-int sync_cgroup_colocation(struct task_struct *p, bool insert)
-{
- unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
-
- return sched_set_group_id(p, grp_id);
-}
-#endif
-
-int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
{
int rc = 0;
unsigned long flags;
struct related_thread_group *grp = NULL;
+ if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+ return -EINVAL;
+
raw_spin_lock_irqsave(&p->pi_lock, flags);
write_lock(&related_thread_group_lock);
@@ -4167,29 +4146,26 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
}
grp = lookup_related_thread_group(group_id);
- if (!grp) {
- /* This is a reserved id */
- if (group_id == DEFAULT_CGROUP_COLOC_ID) {
- rc = -EINVAL;
- goto done;
- }
-
- grp = alloc_related_thread_group(group_id);
- if (IS_ERR(grp)) {
- rc = -ENOMEM;
- goto done;
- }
-
- list_add(&grp->list, &related_thread_groups);
- }
+ if (list_empty(&grp->list))
+ list_add(&grp->list, &active_related_thread_groups);
rc = add_task_to_group(p, grp);
done:
write_unlock(&related_thread_group_lock);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
return rc;
}
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+ if (group_id == DEFAULT_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ return __sched_set_group_id(p, group_id);
+}
+
unsigned int sched_get_group_id(struct task_struct *p)
{
unsigned int group_id;
@@ -4203,6 +4179,42 @@ unsigned int sched_get_group_id(struct task_struct *p)
return group_id;
}
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore, this colocation group cannot
+ * be destroyed once it has been created. All of this is done as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+ struct related_thread_group *grp = NULL;
+ unsigned long flags;
+
+ grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+ write_lock_irqsave(&related_thread_group_lock, flags);
+ list_add(&grp->list, &active_related_thread_groups);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+ update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+ return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+ unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+ return __sched_set_group_id(p, grp_id);
+}
+#endif
+
static void update_cpu_cluster_capacity(const cpumask_t *cpus)
{
int i;
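
The hmp.c changes are mostly about lifetime and locking: put_task_struct()
moves outside the rq spinlock in clear_hmp_request(), the related thread
groups become a boot-time array indexed by group id instead of ad-hoc
allocations, and membership in the active list is tracked by re-initializing
the node on removal so list_empty() can answer "is this group attached?".
A self-contained sketch of that list_del_init()/list_empty() idiom, mirroring
the circular lists of <linux/list.h>:

    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

    static void list_add(struct list_head *n, struct list_head *head)
    {
            n->next = head->next;
            n->prev = head;
            head->next->prev = n;
            head->next = n;
    }

    /* unlink and re-init: the node points back at itself afterwards,
     * so list_empty(&node) is a cheap "am I attached?" test */
    static void list_del_init(struct list_head *n)
    {
            n->prev->next = n->next;
            n->next->prev = n->prev;
            INIT_LIST_HEAD(n);
    }

    static int list_empty(const struct list_head *h) { return h->next == h; }

    int main(void)
    {
            struct list_head active, grp;

            INIT_LIST_HEAD(&active);
            INIT_LIST_HEAD(&grp);
            printf("attached? %d\n", !list_empty(&grp));  /* 0 */
            list_add(&grp, &active);
            printf("attached? %d\n", !list_empty(&grp));  /* 1 */
            list_del_init(&grp);
            printf("attached? %d\n", !list_empty(&grp));  /* 0 */
            return 0;
    }
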
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 30838bb9b442..f569c6fe3cbb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1448,6 +1448,8 @@ static inline void update_cgroup_boost_settings(void) { }
static inline void restore_cgroup_boost_settings(void) { }
#endif
+extern int alloc_related_thread_groups(void);
+
#else /* CONFIG_SCHED_HMP */
struct hmp_sched_stats;
@@ -1638,6 +1640,7 @@ static inline void set_hmp_defaults(void) { }
static inline void clear_reserved(int cpu) { }
static inline void sched_boost_parse_dt(void) {}
+static inline int alloc_related_thread_groups(void) { return 0; }
#define trace_sched_cpu_load(...)
#define trace_sched_cpu_load_lb(...)
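
The sched.h hunks follow the usual config-stub idiom: a real declaration under
CONFIG_SCHED_HMP and a static inline no-op otherwise, so callers such as
sched_init() need no #ifdef guards. A standalone illustration (compiles to the
stub when CONFIG_SCHED_HMP is unset):

    #include <stdio.h>

    #ifdef CONFIG_SCHED_HMP
    int alloc_related_thread_groups(void);
    #else
    static inline int alloc_related_thread_groups(void) { return 0; }
    #endif

    int main(void)
    {
            /* callers stay #ifdef-free; the stub makes this a no-op */
            printf("ret=%d\n", alloc_related_thread_groups());
            return 0;
    }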