Diffstat (limited to 'kernel')
-rw-r--r--  kernel/auditsc.c          332
-rw-r--r--  kernel/cgroup.c             2
-rw-r--r--  kernel/power/qos.c         32
-rw-r--r--  kernel/sched/core.c        15
-rw-r--r--  kernel/sched/core_ctl.c    10
-rw-r--r--  kernel/sched/hmp.c        218
-rw-r--r--  kernel/sched/sched.h        3
7 files changed, 317 insertions(+), 295 deletions(-)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc04959de..48f45987dc6c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
#include <linux/compat.h>
#include <linux/ctype.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <uapi/linux/limits.h>
#include "audit.h"
@@ -82,7 +83,8 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
#define MAX_EXECVE_AUDIT_LEN 7500
/* max length to print of cmdline/proctitle value during audit */
@@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
return rc;
}
-/*
- * to_send and len_sent accounting are very loose estimates. We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message. In one 7500 byte message we can log up to
- * about 1000 min size arguments. That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
- struct audit_buffer **ab,
- int arg_num,
- size_t *len_sent,
- const char __user *p,
- char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+ struct audit_buffer **ab)
{
- char arg_num_len_buf[12];
- const char __user *tmp_p = p;
- /* how many digits are in arg_num? 5 is the length of ' a=""' */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
- size_t len, len_left, to_send;
- size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
- unsigned int i, has_cntl = 0, too_long = 0;
- int ret;
-
- /* strnlen_user includes the null we don't want to send */
- len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
- /*
- * We just created this mm, if we can't find the strings
- * we just copied into it something is _very_ wrong. Similar
- * for strings that are too long, we should not have created
- * any.
- */
- if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
- send_sig(SIGKILL, current, 0);
- return -1;
+ long len_max;
+ long len_rem;
+ long len_full;
+ long len_buf;
+ long len_abuf;
+ long len_tmp;
+ bool require_data;
+ bool encode;
+ unsigned int iter;
+ unsigned int arg;
+ char *buf_head;
+ char *buf;
+ const char __user *p = (const char __user *)current->mm->arg_start;
+
+ /* NOTE: this buffer needs to be large enough to hold all the non-arg
+ * data we put in the audit record for this argument (see the
+ * code below) ... at this point in time 96 is plenty */
+ char abuf[96];
+
+ /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+ * current value of 7500 is not as important as the fact that it
+ * is less than 8k, a setting of 7500 gives us plenty of wiggle
+ * room if we go over a little bit in the logging below */
+ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+ len_max = MAX_EXECVE_AUDIT_LEN;
+
+ /* scratch buffer to hold the userspace args */
+ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+ if (!buf_head) {
+ audit_panic("out of memory for argv string");
+ return;
}
+ buf = buf_head;
- /* walk the whole argument looking for non-ascii chars */
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+ len_rem = len_max;
+ len_buf = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ iter = 0;
+ arg = 0;
do {
- if (len_left > MAX_EXECVE_AUDIT_LEN)
- to_send = MAX_EXECVE_AUDIT_LEN;
- else
- to_send = len_left;
- ret = copy_from_user(buf, tmp_p, to_send);
- /*
- * There is no reason for this copy to be short. We just
- * copied them here, and the mm hasn't been exposed to user-
- * space yet.
- */
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
- }
- buf[to_send] = '\0';
- has_cntl = audit_string_contains_control(buf, to_send);
- if (has_cntl) {
- /*
- * hex messages get logged as 2 bytes, so we can only
- * send half as much in each message
- */
- max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
- break;
- }
- len_left -= to_send;
- tmp_p += to_send;
- } while (len_left > 0);
-
- len_left = len;
-
- if (len > max_execve_audit_len)
- too_long = 1;
-
- /* rewalk the argument actually logging the message */
- for (i = 0; len_left > 0; i++) {
- int room_left;
-
- if (len_left > max_execve_audit_len)
- to_send = max_execve_audit_len;
- else
- to_send = len_left;
-
- /* do we have space left to send this argument in this ab? */
- room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
- if (has_cntl)
- room_left -= (to_send * 2);
- else
- room_left -= to_send;
- if (room_left < 0) {
- *len_sent = 0;
- audit_log_end(*ab);
- *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
- if (!*ab)
- return 0;
- }
+ /* NOTE: we don't ever want to trust this value for anything
+ * serious, but the audit record format insists we
+ * provide an argument length for really long arguments,
+ * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+ * to use strnlen_user() to obtain this value for
+ * recording in the log, although we don't use it
+ * anywhere here to avoid a double-fetch problem */
+ if (len_full == 0)
+ len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+ /* read more data from userspace */
+ if (require_data) {
+ /* can we make more room in the buffer? */
+ if (buf != buf_head) {
+ memmove(buf_head, buf, len_buf);
+ buf = buf_head;
+ }
+
+ /* fetch as much as we can of the argument */
+ len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+ len_max - len_buf);
+ if (len_tmp == -EFAULT) {
+ /* unable to copy from userspace */
+ send_sig(SIGKILL, current, 0);
+ goto out;
+ } else if (len_tmp == (len_max - len_buf)) {
+ /* buffer is not large enough */
+ require_data = true;
+ /* NOTE: if we are going to span multiple
+ * buffers force the encoding so we stand
+ * a chance at a sane len_full value and
+ * consistent record encoding */
+ encode = true;
+ len_full = len_full * 2;
+ p += len_tmp;
+ } else {
+ require_data = false;
+ if (!encode)
+ encode = audit_string_contains_control(
+ buf, len_tmp);
+ /* try to use a trusted value for len_full */
+ if (len_full < len_max)
+ len_full = (encode ?
+ len_tmp * 2 : len_tmp);
+ p += len_tmp + 1;
+ }
+ len_buf += len_tmp;
+ buf_head[len_buf] = '\0';
- /*
- * first record needs to say how long the original string was
- * so we can be sure nothing was lost.
- */
- if ((i == 0) && (too_long))
- audit_log_format(*ab, " a%d_len=%zu", arg_num,
- has_cntl ? 2*len : len);
-
- /*
- * normally arguments are small enough to fit and we already
- * filled buf above when we checked for control characters
- * so don't bother with another copy_from_user
- */
- if (len >= max_execve_audit_len)
- ret = copy_from_user(buf, p, to_send);
- else
- ret = 0;
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
+ /* length of the buffer in the audit record? */
+ len_abuf = (encode ? len_buf * 2 : len_buf + 2);
}
- buf[to_send] = '\0';
-
- /* actually log it */
- audit_log_format(*ab, " a%d", arg_num);
- if (too_long)
- audit_log_format(*ab, "[%d]", i);
- audit_log_format(*ab, "=");
- if (has_cntl)
- audit_log_n_hex(*ab, buf, to_send);
- else
- audit_log_string(*ab, buf);
-
- p += to_send;
- len_left -= to_send;
- *len_sent += arg_num_len;
- if (has_cntl)
- *len_sent += to_send * 2;
- else
- *len_sent += to_send;
- }
- /* include the null we didn't log */
- return len + 1;
-}
-static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab)
-{
- int i, len;
- size_t len_sent = 0;
- const char __user *p;
- char *buf;
+ /* write as much as we can to the audit log */
+ if (len_buf > 0) {
+ /* NOTE: some magic numbers here - basically if we
+ * can't fit a reasonable amount of data into the
+ * existing audit buffer, flush it and start with
+ * a new buffer */
+ if ((sizeof(abuf) + 8) > len_rem) {
+ len_rem = len_max;
+ audit_log_end(*ab);
+ *ab = audit_log_start(context,
+ GFP_KERNEL, AUDIT_EXECVE);
+ if (!*ab)
+ goto out;
+ }
- p = (const char __user *)current->mm->arg_start;
+ /* create the non-arg portion of the arg record */
+ len_tmp = 0;
+ if (require_data || (iter > 0) ||
+ ((len_abuf + sizeof(abuf)) > len_rem)) {
+ if (iter == 0) {
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d_len=%lu",
+ arg, len_full);
+ }
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d[%d]=", arg, iter++);
+ } else
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d=", arg);
+ WARN_ON(len_tmp >= sizeof(abuf));
+ abuf[sizeof(abuf) - 1] = '\0';
+
+ /* log the arg in the audit record */
+ audit_log_format(*ab, "%s", abuf);
+ len_rem -= len_tmp;
+ len_tmp = len_buf;
+ if (encode) {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem / 2; /* encoding */
+ audit_log_n_hex(*ab, buf, len_tmp);
+ len_rem -= len_tmp * 2;
+ len_abuf -= len_tmp * 2;
+ } else {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem - 2; /* quotes */
+ audit_log_n_string(*ab, buf, len_tmp);
+ len_rem -= len_tmp + 2;
+ /* don't subtract the "2" because we still need
+ * to add quotes to the remaining string */
+ len_abuf -= len_tmp;
+ }
+ len_buf -= len_tmp;
+ buf += len_tmp;
+ }
- audit_log_format(*ab, "argc=%d", context->execve.argc);
+ /* ready to move to the next argument? */
+ if ((len_buf == 0) && !require_data) {
+ arg++;
+ iter = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ }
+ } while (arg < context->execve.argc);
- /*
- * we need some kernel buffer to hold the userspace args. Just
- * allocate one big one rather than allocating one of the right size
- * for every single argument inside audit_log_single_execve_arg()
- * should be <8k allocation so should be pretty safe.
- */
- buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
- if (!buf) {
- audit_panic("out of memory for argv string");
- return;
- }
+ /* NOTE: the caller handles the final audit_log_end() call */
- for (i = 0; i < context->execve.argc; i++) {
- len = audit_log_single_execve_arg(context, ab, i,
- &len_sent, p, buf);
- if (len <= 0)
- break;
- p += len;
- }
- kfree(buf);
+out:
+ kfree(buf_head);
}
static void show_special(struct audit_context *context, int *call_panic)
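
The rewrite above replaces the two-pass scheme of audit_log_single_execve_arg()
(one userspace walk to detect control characters, then a second copy_from_user()
to log) with a single strncpy_from_user() fetch per chunk, closing the
double-fetch window where userspace could change an argument between the check
and the copy. A minimal userspace sketch of the same single-fetch pattern
(illustrative names, not kernel API):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    static int contains_control(const char *s, size_t len)
    {
            size_t i;

            for (i = 0; i < len; i++)
                    if (iscntrl((unsigned char)s[i]) || s[i] == '"')
                            return 1;
            return 0;
    }

    /* copy the untrusted data exactly once; every later decision
     * (encoding, length) is made on that same copy */
    static void log_arg_once(const char *untrusted, char *buf, size_t bufsz)
    {
            size_t i, len = strnlen(untrusted, bufsz - 1);

            memcpy(buf, untrusted, len);
            buf[len] = '\0';

            if (contains_control(buf, len)) {
                    for (i = 0; i < len; i++)
                            printf("%02X", (unsigned char)buf[i]);
                    putchar('\n');
            } else {
                    printf("\"%s\"\n", buf);
            }
    }

    int main(void)
    {
            char buf[64];

            log_arg_once("hello world", buf, sizeof(buf));
            log_arg_once("tab\there", buf, sizeof(buf));
            return 0;
    }
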
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ae83d9602aa0..8c9823947c7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6002,7 +6002,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct task_struct *task;
int count = 0;
- seq_printf(seq, "css_set %p\n", cset);
+ seq_printf(seq, "css_set %pK\n", cset);
list_for_each_entry(task, &cset->tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
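
The one-line cgroup.c change swaps %p for %pK so the css_set address in this
debug file honors kptr_restrict: unprivileged readers see a redacted value
instead of a raw kernel pointer. A userspace analogue of the same
"redact pointers for the unprivileged" idea (hypothetical helper, not
kernel API):

    #include <stdio.h>
    #include <unistd.h>

    /* print the real address only when privileged, mirroring what
     * %pK does under kptr_restrict */
    static void debug_print_ptr(const char *tag, const void *p)
    {
            if (geteuid() == 0)
                    printf("%s %p\n", tag, p);
            else
                    printf("%s 0x0\n", tag);
    }

    int main(void)
    {
            int obj;

            debug_print_ptr("css_set", &obj);
            return 0;
    }
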
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 69c32c42080f..582b66e882ce 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -358,7 +358,11 @@ int pm_qos_update_target(struct pm_qos_constraints *c,
spin_unlock_irqrestore(&pm_qos_lock, flags);
trace_pm_qos_update_target(action, prev_value, curr_value);
- if (prev_value != curr_value) {
+ /*
+ * If any CPU mask bits are set, call the notifier chain
+ * to apply the new QoS restriction to those cores.
+ */
+ if (!cpumask_empty(&cpus)) {
ret = 1;
if (c->notifiers)
blocking_notifier_call_chain(c->notifiers,
@@ -592,7 +596,6 @@ void pm_qos_add_request(struct pm_qos_request *req,
#ifdef CONFIG_SMP
case PM_QOS_REQ_AFFINE_IRQ:
if (irq_can_set_affinity(req->irq)) {
- int ret = 0;
struct irq_desc *desc = irq_to_desc(req->irq);
struct cpumask *mask = desc->irq_data.common->affinity;
@@ -602,13 +605,6 @@ void pm_qos_add_request(struct pm_qos_request *req,
req->irq_notify.notify = pm_qos_irq_notify;
req->irq_notify.release = pm_qos_irq_release;
- ret = irq_set_affinity_notifier(req->irq,
- &req->irq_notify);
- if (ret) {
- WARN(1, KERN_ERR "IRQ affinity notify set failed\n");
- req->type = PM_QOS_REQ_ALL_CORES;
- cpumask_setall(&req->cpus_affine);
- }
} else {
req->type = PM_QOS_REQ_ALL_CORES;
cpumask_setall(&req->cpus_affine);
@@ -630,6 +626,24 @@ void pm_qos_add_request(struct pm_qos_request *req,
trace_pm_qos_add_request(pm_qos_class, value);
pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
req, PM_QOS_ADD_REQ, value);
+
+#ifdef CONFIG_SMP
+ if (req->type == PM_QOS_REQ_AFFINE_IRQ &&
+ irq_can_set_affinity(req->irq)) {
+ int ret = 0;
+
+ ret = irq_set_affinity_notifier(req->irq,
+ &req->irq_notify);
+ if (ret) {
+ WARN(1, "IRQ affinity notify set failed\n");
+ req->type = PM_QOS_REQ_ALL_CORES;
+ cpumask_setall(&req->cpus_affine);
+ pm_qos_update_target(
+ pm_qos_array[pm_qos_class]->constraints,
+ req, PM_QOS_UPDATE_REQ, value);
+ }
+ }
+#endif
}
EXPORT_SYMBOL_GPL(pm_qos_add_request);
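
Two things happen in the qos.c hunks: pm_qos_update_target() now fires the
notifier chain whenever the affected CPU mask is non-empty rather than only
when the aggregate value changes, and irq_set_affinity_notifier() registration
moves to the end of pm_qos_add_request(), after the request is fully
initialized and the initial target update has run, falling back to
PM_QOS_REQ_ALL_CORES (plus a corrective update) if registration fails. A small
sketch of that "publish the callback last" ordering (illustrative names):

    #include <stdio.h>

    struct request {
            int value;
            void (*notify)(struct request *);
    };

    static struct request *registered;

    /* once registered, the callback may fire at any time, so the
     * object must already be fully initialized */
    static void register_notifier(struct request *req)
    {
            registered = req;
    }

    static void on_change(struct request *req)
    {
            printf("notified, value=%d\n", req->value);
    }

    int main(void)
    {
            struct request req = { 0 };

            req.value = 100;            /* 1. initialize state  */
            req.notify = on_change;     /* 2. wire the callback */
            register_notifier(&req);    /* 3. publish last      */

            if (registered && registered->notify)
                    registered->notify(registered);
            return 0;
    }
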
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d101e8a5f2..d7846edd7a79 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5600,7 +5600,6 @@ int do_isolation_work_cpu_stop(void *data)
*/
nohz_balance_clear_nohz_mask(cpu);
- clear_hmp_request(cpu);
local_irq_enable();
return 0;
}
@@ -5682,7 +5681,7 @@ int sched_isolate_cpu(int cpu)
if (trace_sched_isolate_enabled())
start_time = sched_clock();
- lock_device_hotplug();
+ cpu_maps_update_begin();
cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
@@ -5725,13 +5724,14 @@ int sched_isolate_cpu(int cpu)
migrate_sync_cpu(cpu, cpumask_first(&avail_cpus));
stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+ clear_hmp_request(cpu);
calc_load_migrate(rq);
update_max_interval();
sched_update_group_capacities(cpu);
out:
- unlock_device_hotplug();
+ cpu_maps_update_done();
trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
start_time, 1);
return ret_code;
@@ -5752,8 +5752,6 @@ int sched_unisolate_cpu_unlocked(int cpu)
if (trace_sched_isolate_enabled())
start_time = sched_clock();
- lock_device_hotplug_assert();
-
if (!cpu_isolation_vote[cpu]) {
ret_code = -EINVAL;
goto out;
@@ -5792,9 +5790,9 @@ int sched_unisolate_cpu(int cpu)
{
int ret_code;
- lock_device_hotplug();
+ cpu_maps_update_begin();
ret_code = sched_unisolate_cpu_unlocked(cpu);
- unlock_device_hotplug();
+ cpu_maps_update_done();
return ret_code;
}
@@ -8073,6 +8071,9 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
+ i = alloc_related_thread_groups();
+ BUG_ON(i);
+
set_hmp_defaults();
set_load_weight(&init_task);
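
The core.c hunks switch CPU isolation from the device-hotplug lock to
cpu_maps_update_begin()/cpu_maps_update_done(), the same mutex CPU hotplug
itself takes, move clear_hmp_request() to after stop_cpus() so requests raised
while the stopper runs are also cleared, and allocate the related thread
groups once at boot in sched_init(). A minimal sketch of the begin/done
critical-section pairing (userspace analogue, illustrative names):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t maps_lock = PTHREAD_MUTEX_INITIALIZER;

    static void maps_update_begin(void) { pthread_mutex_lock(&maps_lock); }
    static void maps_update_done(void)  { pthread_mutex_unlock(&maps_lock); }

    /* every path that changes the CPU maps takes the same mutex,
     * so isolation and hotplug can never interleave */
    static int isolate_cpu(int cpu)
    {
            maps_update_begin();
            printf("isolating cpu %d with hotplug excluded\n", cpu);
            /* ... migrate tasks, clear pending requests ... */
            maps_update_done();
            return 0;
    }

    int main(void)
    {
            return isolate_cpu(1);
    }
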
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index 9b21a09ec4ba..aac12bfc2ae6 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -893,14 +893,10 @@ static int __ref cpu_callback(struct notifier_block *nfb,
unsigned int need;
int ret = NOTIFY_OK;
- /* Don't affect suspend resume */
- if (action & CPU_TASKS_FROZEN)
- return NOTIFY_OK;
-
if (unlikely(!cluster || !cluster->inited))
return NOTIFY_OK;
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
/* If online state of CPU somehow got out of sync, fix it. */
@@ -1095,7 +1091,7 @@ static int __init core_ctl_init(void)
cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
- lock_device_hotplug();
+ cpu_maps_update_begin();
for_each_online_cpu(cpu) {
struct cpufreq_policy *policy;
int ret;
@@ -1109,7 +1105,7 @@ static int __init core_ctl_init(void)
cpufreq_cpu_put(policy);
}
}
- unlock_device_hotplug();
+ cpu_maps_update_done();
initialized = true;
return 0;
}
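
In core_ctl.c the notifier no longer ignores suspend/resume transitions;
instead it masks CPU_TASKS_FROZEN off the action so frozen hotplug events take
the same paths as normal ones. A runnable sketch of that flag-masking idiom
(the constants mirror the kernel's notifier values, shown here for
illustration only):

    #include <stdio.h>

    #define CPU_UP_PREPARE   0x0003
    #define CPU_DEAD         0x0007
    #define CPU_TASKS_FROZEN 0x0010

    /* strip the modifier bit before the switch so FROZEN variants
     * of each event land in the same case */
    static void cpu_callback(unsigned long action)
    {
            int frozen = !!(action & CPU_TASKS_FROZEN);

            switch (action & ~CPU_TASKS_FROZEN) {
            case CPU_UP_PREPARE:
                    printf("up-prepare (frozen=%d)\n", frozen);
                    break;
            case CPU_DEAD:
                    printf("dead (frozen=%d)\n", frozen);
                    break;
            }
    }

    int main(void)
    {
            cpu_callback(CPU_UP_PREPARE);
            cpu_callback(CPU_UP_PREPARE | CPU_TASKS_FROZEN);
            return 0;
    }
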
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 968a41e0e81e..6304c5030137 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -641,14 +641,18 @@ void clear_hmp_request(int cpu)
clear_boost_kick(cpu);
clear_reserved(cpu);
if (rq->push_task) {
+ struct task_struct *push_task = NULL;
+
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->push_task) {
clear_reserved(rq->push_cpu);
- put_task_struct(rq->push_task);
+ push_task = rq->push_task;
rq->push_task = NULL;
}
rq->active_balance = 0;
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ if (push_task)
+ put_task_struct(push_task);
}
}
@@ -784,11 +788,12 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000;
static unsigned int sync_cpu;
-static LIST_HEAD(related_thread_groups);
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
static DEFINE_RWLOCK(related_thread_group_lock);
#define for_each_related_thread_group(grp) \
- list_for_each_entry(grp, &related_thread_groups, list)
+ list_for_each_entry(grp, &active_related_thread_groups, list)
/*
* Task load is categorized into buckets for the purpose of top task tracking.
@@ -1767,20 +1772,20 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups)
if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
return 0;
} else {
- read_lock(&related_thread_group_lock);
+ read_lock_irqsave(&related_thread_group_lock, flags);
/*
* Protect from concurrent update of rq->prev_runnable_sum and
* group cpu load
*/
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
if (check_groups)
_group_load_in_cpu(cpu_of(rq), &group_load, NULL);
new_load = rq->prev_runnable_sum + group_load;
new_load = freq_policy_load(rq, new_load);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- read_unlock(&related_thread_group_lock);
+ raw_spin_unlock(&rq->lock);
+ read_unlock_irqrestore(&related_thread_group_lock, flags);
cur_freq = load_to_freq(rq, rq->old_busy_time);
freq_required = load_to_freq(rq, new_load);
@@ -3052,7 +3057,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
read_unlock(&tasklist_lock);
- list_for_each_entry(grp, &related_thread_groups, list) {
+ list_for_each_entry(grp, &active_related_thread_groups, list) {
int j;
for_each_possible_cpu(j) {
@@ -3202,14 +3207,16 @@ void sched_get_cpus_busy(struct sched_load *busy,
if (unlikely(cpus == 0))
return;
+ local_irq_save(flags);
+
+ read_lock(&related_thread_group_lock);
+
/*
* This function could be called in timer context, and the
* current task may have been executing for a long time. Ensure
* that the window stats are current by doing an update.
*/
- read_lock(&related_thread_group_lock);
- local_irq_save(flags);
for_each_cpu(cpu, query_cpus)
raw_spin_lock(&cpu_rq(cpu)->lock);
@@ -3309,10 +3316,11 @@ skip_early:
for_each_cpu(cpu, query_cpus)
raw_spin_unlock(&(cpu_rq(cpu))->lock);
- local_irq_restore(flags);
read_unlock(&related_thread_group_lock);
+ local_irq_restore(flags);
+
i = 0;
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
@@ -3965,47 +3973,54 @@ _group_cpu_time(struct related_thread_group *grp, int cpu)
return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
}
-struct related_thread_group *alloc_related_thread_group(int group_id)
+static inline struct related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
{
- struct related_thread_group *grp;
-
- grp = kzalloc(sizeof(*grp), GFP_ATOMIC);
- if (!grp)
- return ERR_PTR(-ENOMEM);
-
- if (alloc_group_cputime(grp)) {
- kfree(grp);
- return ERR_PTR(-ENOMEM);
- }
-
- grp->id = group_id;
- INIT_LIST_HEAD(&grp->tasks);
- INIT_LIST_HEAD(&grp->list);
- raw_spin_lock_init(&grp->lock);
-
- return grp;
+ return related_thread_groups[group_id];
}
-struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
+int alloc_related_thread_groups(void)
{
+ int i, ret;
struct related_thread_group *grp;
- list_for_each_entry(grp, &related_thread_groups, list) {
- if (grp->id == group_id)
- return grp;
+ /* group_id = 0 is invalid as it's the special id used to remove a group. */
+ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+ grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+ if (!grp) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (alloc_group_cputime(grp)) {
+ kfree(grp);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ grp->id = i;
+ INIT_LIST_HEAD(&grp->tasks);
+ INIT_LIST_HEAD(&grp->list);
+ raw_spin_lock_init(&grp->lock);
+
+ related_thread_groups[i] = grp;
}
- return NULL;
-}
+ return 0;
-/* See comments before preferred_cluster() */
-static void free_related_thread_group(struct rcu_head *rcu)
-{
- struct related_thread_group *grp = container_of(rcu, struct
- related_thread_group, rcu);
+err:
+ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+ grp = lookup_related_thread_group(i);
+ if (grp) {
+ free_group_cputime(grp);
+ kfree(grp);
+ related_thread_groups[i] = NULL;
+ } else {
+ break;
+ }
+ }
- free_group_cputime(grp);
- kfree(grp);
+ return ret;
}
static void remove_task_from_group(struct task_struct *p)
@@ -4030,10 +4045,12 @@ static void remove_task_from_group(struct task_struct *p)
raw_spin_unlock(&grp->lock);
/* Reserved groups cannot be destroyed */
- if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) {
- list_del(&grp->list);
- call_rcu(&grp->rcu, free_related_thread_group);
- }
+ if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+ /*
+ * We test whether grp->list is attached with list_empty(),
+ * hence re-init the list after deletion.
+ */
+ list_del_init(&grp->list);
}
static int
@@ -4105,53 +4122,15 @@ void add_new_task_to_grp(struct task_struct *new)
write_unlock_irqrestore(&related_thread_group_lock, flags);
}
-#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
-/*
- * We create a default colocation group at boot. There is no need to
- * synchronize tasks between cgroups at creation time because the
- * correct cgroup hierarchy is not available at boot. Therefore cgroup
- * colocation is turned off by default even though the colocation group
- * itself has been allocated. Furthermore this colocation group cannot
- * be destroyted once it has been created. All of this has been as part
- * of runtime optimizations.
- *
- * The job of synchronizing tasks to the colocation group is done when
- * the colocation flag in the cgroup is turned on.
- */
-static int __init create_default_coloc_group(void)
-{
- struct related_thread_group *grp = NULL;
- unsigned long flags;
-
- grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
- if (IS_ERR(grp)) {
- WARN_ON(1);
- return -ENOMEM;
- }
-
- write_lock_irqsave(&related_thread_group_lock, flags);
- list_add(&grp->list, &related_thread_groups);
- write_unlock_irqrestore(&related_thread_group_lock, flags);
-
- update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
- return 0;
-}
-late_initcall(create_default_coloc_group);
-
-int sync_cgroup_colocation(struct task_struct *p, bool insert)
-{
- unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
-
- return sched_set_group_id(p, grp_id);
-}
-#endif
-
-int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
{
int rc = 0;
unsigned long flags;
struct related_thread_group *grp = NULL;
+ if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+ return -EINVAL;
+
raw_spin_lock_irqsave(&p->pi_lock, flags);
write_lock(&related_thread_group_lock);
@@ -4167,29 +4146,26 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
}
grp = lookup_related_thread_group(group_id);
- if (!grp) {
- /* This is a reserved id */
- if (group_id == DEFAULT_CGROUP_COLOC_ID) {
- rc = -EINVAL;
- goto done;
- }
-
- grp = alloc_related_thread_group(group_id);
- if (IS_ERR(grp)) {
- rc = -ENOMEM;
- goto done;
- }
-
- list_add(&grp->list, &related_thread_groups);
- }
+ if (list_empty(&grp->list))
+ list_add(&grp->list, &active_related_thread_groups);
rc = add_task_to_group(p, grp);
done:
write_unlock(&related_thread_group_lock);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
return rc;
}
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+ if (group_id == DEFAULT_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ return __sched_set_group_id(p, group_id);
+}
+
unsigned int sched_get_group_id(struct task_struct *p)
{
unsigned int group_id;
@@ -4203,6 +4179,42 @@ unsigned int sched_get_group_id(struct task_struct *p)
return group_id;
}
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore, this colocation group cannot
+ * be destroyed once it has been created. All of this is done as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+ struct related_thread_group *grp = NULL;
+ unsigned long flags;
+
+ grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+ write_lock_irqsave(&related_thread_group_lock, flags);
+ list_add(&grp->list, &active_related_thread_groups);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+ update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+ return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+ unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+ return __sched_set_group_id(p, grp_id);
+}
+#endif
+
static void update_cpu_cluster_capacity(const cpumask_t *cpus)
{
int i;
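
The hmp.c changes are mostly about lifetime and locking: put_task_struct()
moves outside the rq spinlock in clear_hmp_request(), the related thread
groups become a boot-time array indexed by group id instead of ad-hoc
allocations, and membership in the active list is tracked by re-initializing
the node on removal so list_empty() can answer "is this group attached?".
A self-contained sketch of that list_del_init()/list_empty() idiom, mirroring
the circular lists of <linux/list.h>:

    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

    static void list_add(struct list_head *n, struct list_head *head)
    {
            n->next = head->next;
            n->prev = head;
            head->next->prev = n;
            head->next = n;
    }

    /* unlink and re-init: the node points back at itself afterwards,
     * so list_empty(&node) is a cheap "am I attached?" test */
    static void list_del_init(struct list_head *n)
    {
            n->prev->next = n->next;
            n->next->prev = n->prev;
            INIT_LIST_HEAD(n);
    }

    static int list_empty(const struct list_head *h) { return h->next == h; }

    int main(void)
    {
            struct list_head active, grp;

            INIT_LIST_HEAD(&active);
            INIT_LIST_HEAD(&grp);
            printf("attached? %d\n", !list_empty(&grp));  /* 0 */
            list_add(&grp, &active);
            printf("attached? %d\n", !list_empty(&grp));  /* 1 */
            list_del_init(&grp);
            printf("attached? %d\n", !list_empty(&grp));  /* 0 */
            return 0;
    }
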
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 30838bb9b442..f569c6fe3cbb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1448,6 +1448,8 @@ static inline void update_cgroup_boost_settings(void) { }
static inline void restore_cgroup_boost_settings(void) { }
#endif
+extern int alloc_related_thread_groups(void);
+
#else /* CONFIG_SCHED_HMP */
struct hmp_sched_stats;
@@ -1638,6 +1640,7 @@ static inline void set_hmp_defaults(void) { }
static inline void clear_reserved(int cpu) { }
static inline void sched_boost_parse_dt(void) {}
+static inline int alloc_related_thread_groups(void) { return 0; }
#define trace_sched_cpu_load(...)
#define trace_sched_cpu_load_lb(...)
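
The sched.h hunks follow the usual config-stub idiom: a real declaration under
CONFIG_SCHED_HMP and a static inline no-op otherwise, so callers such as
sched_init() need no #ifdef guards. A standalone illustration (compiles to the
stub when CONFIG_SCHED_HMP is unset):

    #include <stdio.h>

    #ifdef CONFIG_SCHED_HMP
    int alloc_related_thread_groups(void);
    #else
    static inline int alloc_related_thread_groups(void) { return 0; }
    #endif

    int main(void)
    {
            /* callers stay #ifdef-free; the stub makes this a no-op */
            printf("ret=%d\n", alloc_related_thread_groups());
            return 0;
    }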