Diffstat (limited to 'kernel')
-rw-r--r--   kernel/auditsc.c        | 332
-rw-r--r--   kernel/cgroup.c         |   2
-rw-r--r--   kernel/power/qos.c      |  32
-rw-r--r--   kernel/sched/core.c     |  15
-rw-r--r--   kernel/sched/core_ctl.c |  10
-rw-r--r--   kernel/sched/hmp.c      | 218
-rw-r--r--   kernel/sched/sched.h    |   3

7 files changed, 317 insertions(+), 295 deletions(-)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc04959de..48f45987dc6c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
 #include <linux/compat.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
+#include <linux/uaccess.h>
 #include <uapi/linux/limits.h>
 
 #include "audit.h"
@@ -82,7 +83,8 @@
 #define AUDITSC_SUCCESS 1
 #define AUDITSC_FAILURE 2
 
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
 #define MAX_EXECVE_AUDIT_LEN 7500
 
 /* max length to print of cmdline/proctitle value during audit */
@@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
 	return rc;
 }
 
-/*
- * to_send and len_sent accounting are very loose estimates.  We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message. In one 7500 byte message we can log up to
- * about 1000 min size arguments. That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
-					struct audit_buffer **ab,
-					int arg_num,
-					size_t *len_sent,
-					const char __user *p,
-					char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+				  struct audit_buffer **ab)
 {
-	char arg_num_len_buf[12];
-	const char __user *tmp_p = p;
-	/* how many digits are in arg_num? 5 is the length of ' a=""' */
-	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
-	size_t len, len_left, to_send;
-	size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
-	unsigned int i, has_cntl = 0, too_long = 0;
-	int ret;
-
-	/* strnlen_user includes the null we don't want to send */
-	len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
-	/*
-	 * We just created this mm, if we can't find the strings
-	 * we just copied into it something is _very_ wrong. Similar
-	 * for strings that are too long, we should not have created
-	 * any.
-	 */
-	if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
-		send_sig(SIGKILL, current, 0);
-		return -1;
+	long len_max;
+	long len_rem;
+	long len_full;
+	long len_buf;
+	long len_abuf;
+	long len_tmp;
+	bool require_data;
+	bool encode;
+	unsigned int iter;
+	unsigned int arg;
+	char *buf_head;
+	char *buf;
+	const char __user *p = (const char __user *)current->mm->arg_start;
+
+	/* NOTE: this buffer needs to be large enough to hold all the non-arg
+	 *       data we put in the audit record for this argument (see the
+	 *       code below) ... at this point in time 96 is plenty */
+	char abuf[96];
+
+	/* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+	 *       current value of 7500 is not as important as the fact that it
+	 *       is less than 8k, a setting of 7500 gives us plenty of wiggle
+	 *       room if we go over a little bit in the logging below */
+	WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+	len_max = MAX_EXECVE_AUDIT_LEN;
+
+	/* scratch buffer to hold the userspace args */
+	buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+	if (!buf_head) {
+		audit_panic("out of memory for argv string");
+		return;
 	}
+	buf = buf_head;
 
-	/* walk the whole argument looking for non-ascii chars */
+	audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+	len_rem = len_max;
+	len_buf = 0;
+	len_full = 0;
+	require_data = true;
+	encode = false;
+	iter = 0;
+	arg = 0;
 	do {
-		if (len_left > MAX_EXECVE_AUDIT_LEN)
-			to_send = MAX_EXECVE_AUDIT_LEN;
-		else
-			to_send = len_left;
-		ret = copy_from_user(buf, tmp_p, to_send);
-		/*
-		 * There is no reason for this copy to be short. We just
-		 * copied them here, and the mm hasn't been exposed to user-
-		 * space yet.
-		 */
-		if (ret) {
-			WARN_ON(1);
-			send_sig(SIGKILL, current, 0);
-			return -1;
-		}
-		buf[to_send] = '\0';
-		has_cntl = audit_string_contains_control(buf, to_send);
-		if (has_cntl) {
-			/*
-			 * hex messages get logged as 2 bytes, so we can only
-			 * send half as much in each message
-			 */
-			max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
-			break;
-		}
-		len_left -= to_send;
-		tmp_p += to_send;
-	} while (len_left > 0);
-
-	len_left = len;
-
-	if (len > max_execve_audit_len)
-		too_long = 1;
-
-	/* rewalk the argument actually logging the message */
-	for (i = 0; len_left > 0; i++) {
-		int room_left;
-
-		if (len_left > max_execve_audit_len)
-			to_send = max_execve_audit_len;
-		else
-			to_send = len_left;
-
-		/* do we have space left to send this argument in this ab? */
-		room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
-		if (has_cntl)
-			room_left -= (to_send * 2);
-		else
-			room_left -= to_send;
-		if (room_left < 0) {
-			*len_sent = 0;
-			audit_log_end(*ab);
-			*ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
-			if (!*ab)
-				return 0;
-		}
+		/* NOTE: we don't ever want to trust this value for anything
+		 *       serious, but the audit record format insists we
+		 *       provide an argument length for really long arguments,
+		 *       e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+		 *       to use strncpy_from_user() to obtain this value for
+		 *       recording in the log, although we don't use it
+		 *       anywhere here to avoid a double-fetch problem */
+		if (len_full == 0)
+			len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+		/* read more data from userspace */
+		if (require_data) {
+			/* can we make more room in the buffer? */
+			if (buf != buf_head) {
+				memmove(buf_head, buf, len_buf);
+				buf = buf_head;
+			}
+
+			/* fetch as much as we can of the argument */
+			len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+						    len_max - len_buf);
+			if (len_tmp == -EFAULT) {
+				/* unable to copy from userspace */
+				send_sig(SIGKILL, current, 0);
+				goto out;
+			} else if (len_tmp == (len_max - len_buf)) {
+				/* buffer is not large enough */
+				require_data = true;
+				/* NOTE: if we are going to span multiple
+				 *       buffers force the encoding so we stand
+				 *       a chance at a sane len_full value and
+				 *       consistent record encoding */
+				encode = true;
+				len_full = len_full * 2;
+				p += len_tmp;
+			} else {
+				require_data = false;
+				if (!encode)
+					encode = audit_string_contains_control(
+								buf, len_tmp);
+				/* try to use a trusted value for len_full */
+				if (len_full < len_max)
+					len_full = (encode ?
+						    len_tmp * 2 : len_tmp);
+				p += len_tmp + 1;
+			}
+			len_buf += len_tmp;
+			buf_head[len_buf] = '\0';
 
-		/*
-		 * first record needs to say how long the original string was
-		 * so we can be sure nothing was lost.
-		 */
-		if ((i == 0) && (too_long))
-			audit_log_format(*ab, " a%d_len=%zu", arg_num,
-					 has_cntl ? 2*len : len);
-
-		/*
-		 * normally arguments are small enough to fit and we already
-		 * filled buf above when we checked for control characters
-		 * so don't bother with another copy_from_user
-		 */
-		if (len >= max_execve_audit_len)
-			ret = copy_from_user(buf, p, to_send);
-		else
-			ret = 0;
-		if (ret) {
-			WARN_ON(1);
-			send_sig(SIGKILL, current, 0);
-			return -1;
+			/* length of the buffer in the audit record? */
+			len_abuf = (encode ? len_buf * 2 : len_buf + 2);
 		}
-		buf[to_send] = '\0';
-
-		/* actually log it */
-		audit_log_format(*ab, " a%d", arg_num);
-		if (too_long)
-			audit_log_format(*ab, "[%d]", i);
-		audit_log_format(*ab, "=");
-		if (has_cntl)
-			audit_log_n_hex(*ab, buf, to_send);
-		else
-			audit_log_string(*ab, buf);
-
-		p += to_send;
-		len_left -= to_send;
-		*len_sent += arg_num_len;
-		if (has_cntl)
-			*len_sent += to_send * 2;
-		else
-			*len_sent += to_send;
-	}
-	/* include the null we didn't log */
-	return len + 1;
-}
 
-static void audit_log_execve_info(struct audit_context *context,
-				  struct audit_buffer **ab)
-{
-	int i, len;
-	size_t len_sent = 0;
-	const char __user *p;
-	char *buf;
+		/* write as much as we can to the audit log */
+		if (len_buf > 0) {
+			/* NOTE: some magic numbers here - basically if we
+			 *       can't fit a reasonable amount of data into the
+			 *       existing audit buffer, flush it and start with
+			 *       a new buffer */
+			if ((sizeof(abuf) + 8) > len_rem) {
+				len_rem = len_max;
+				audit_log_end(*ab);
+				*ab = audit_log_start(context,
+						      GFP_KERNEL, AUDIT_EXECVE);
+				if (!*ab)
+					goto out;
+			}
 
-	p = (const char __user *)current->mm->arg_start;
+			/* create the non-arg portion of the arg record */
+			len_tmp = 0;
+			if (require_data || (iter > 0) ||
+			    ((len_abuf + sizeof(abuf)) > len_rem)) {
+				if (iter == 0) {
+					len_tmp += snprintf(&abuf[len_tmp],
+							sizeof(abuf) - len_tmp,
+							" a%d_len=%lu",
+							arg, len_full);
+				}
+				len_tmp += snprintf(&abuf[len_tmp],
+						    sizeof(abuf) - len_tmp,
+						    " a%d[%d]=", arg, iter++);
+			} else
+				len_tmp += snprintf(&abuf[len_tmp],
+						    sizeof(abuf) - len_tmp,
+						    " a%d=", arg);
+			WARN_ON(len_tmp >= sizeof(abuf));
+			abuf[sizeof(abuf) - 1] = '\0';
+
+			/* log the arg in the audit record */
+			audit_log_format(*ab, "%s", abuf);
+			len_rem -= len_tmp;
+			len_tmp = len_buf;
+			if (encode) {
+				if (len_abuf > len_rem)
+					len_tmp = len_rem / 2; /* encoding */
+				audit_log_n_hex(*ab, buf, len_tmp);
+				len_rem -= len_tmp * 2;
+				len_abuf -= len_tmp * 2;
+			} else {
+				if (len_abuf > len_rem)
+					len_tmp = len_rem - 2; /* quotes */
+				audit_log_n_string(*ab, buf, len_tmp);
+				len_rem -= len_tmp + 2;
+				/* don't subtract the "2" because we still need
+				 * to add quotes to the remaining string */
+				len_abuf -= len_tmp;
+			}
+			len_buf -= len_tmp;
+			buf += len_tmp;
+		}
 
-	audit_log_format(*ab, "argc=%d", context->execve.argc);
+		/* ready to move to the next argument? */
+		if ((len_buf == 0) && !require_data) {
+			arg++;
+			iter = 0;
+			len_full = 0;
+			require_data = true;
+			encode = false;
+		}
+	} while (arg < context->execve.argc);
 
-	/*
-	 * we need some kernel buffer to hold the userspace args.  Just
-	 * allocate one big one rather than allocating one of the right size
-	 * for every single argument inside audit_log_single_execve_arg()
-	 * should be <8k allocation so should be pretty safe.
-	 */
-	buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
-	if (!buf) {
-		audit_panic("out of memory for argv string");
-		return;
-	}
+	/* NOTE: the caller handles the final audit_log_end() call */
 
-	for (i = 0; i < context->execve.argc; i++) {
-		len = audit_log_single_execve_arg(context, ab, i,
-						  &len_sent, p, buf);
-		if (len <= 0)
-			break;
-		p += len;
-	}
-	kfree(buf);
+out:
+	kfree(buf_head);
 }
 
 static void show_special(struct audit_context *context, int *call_panic)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ae83d9602aa0..8c9823947c7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6002,7 +6002,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	struct task_struct *task;
 	int count = 0;
 
-	seq_printf(seq, "css_set %p\n", cset);
+	seq_printf(seq, "css_set %pK\n", cset);
 
 	list_for_each_entry(task, &cset->tasks, cg_list) {
 		if (count++ > MAX_TASKS_SHOWN_PER_CSS)
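The new audit_log_execve_info() sizes each record around one invariant: a hex-encoded argument costs two output bytes per input byte, while a plain argument costs its length plus two quote characters (the "len_abuf = encode ? len_buf * 2 : len_buf + 2" accounting above). A minimal standalone C sketch of that accounting follows; the control-character test only approximates audit_string_contains_control(), and all names are local to the example:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* roughly mirrors audit_string_contains_control(): control chars,
 * quotes, and non-printable bytes force hex encoding */
static bool needs_hex(const char *s, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		if (s[i] == '"' || s[i] < 0x21 || s[i] > 0x7e)
			return true;
	return false;
}

int main(void)
{
	const char *args[] = { "ls", "-l", "dir\nwith\nnewlines" };
	size_t i;

	for (i = 0; i < sizeof(args) / sizeof(args[0]); i++) {
		size_t len = strlen(args[i]);
		bool encode = needs_hex(args[i], len);

		/* same formula as len_abuf in audit_log_execve_info() */
		printf("a%zu: len=%zu logged_bytes=%zu (%s)\n", i, len,
		       encode ? len * 2 : len + 2,
		       encode ? "hex" : "quoted");
	}
	return 0;
}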
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 69c32c42080f..582b66e882ce 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -358,7 +358,11 @@ int pm_qos_update_target(struct pm_qos_constraints *c,
 	spin_unlock_irqrestore(&pm_qos_lock, flags);
 
 	trace_pm_qos_update_target(action, prev_value, curr_value);
-	if (prev_value != curr_value) {
+	/*
+	 * if cpu mask bits are set, call the notifier call chain
+	 * to update the new qos restriction for the cores
+	 */
+	if (!cpumask_empty(&cpus)) {
 		ret = 1;
 		if (c->notifiers)
 			blocking_notifier_call_chain(c->notifiers,
@@ -592,7 +596,6 @@ void pm_qos_add_request(struct pm_qos_request *req,
 #ifdef CONFIG_SMP
 	case PM_QOS_REQ_AFFINE_IRQ:
 		if (irq_can_set_affinity(req->irq)) {
-			int ret = 0;
 			struct irq_desc *desc = irq_to_desc(req->irq);
 			struct cpumask *mask = desc->irq_data.common->affinity;
 
@@ -602,13 +605,6 @@ void pm_qos_add_request(struct pm_qos_request *req,
 			req->irq_notify.notify = pm_qos_irq_notify;
 			req->irq_notify.release = pm_qos_irq_release;
 
-			ret = irq_set_affinity_notifier(req->irq,
-					&req->irq_notify);
-			if (ret) {
-				WARN(1, KERN_ERR "IRQ affinity notify set failed\n");
-				req->type = PM_QOS_REQ_ALL_CORES;
-				cpumask_setall(&req->cpus_affine);
-			}
 		} else {
 			req->type = PM_QOS_REQ_ALL_CORES;
 			cpumask_setall(&req->cpus_affine);
@@ -630,6 +626,24 @@ void pm_qos_add_request(struct pm_qos_request *req,
 	trace_pm_qos_add_request(pm_qos_class, value);
 	pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
 			     req, PM_QOS_ADD_REQ, value);
+
+#ifdef CONFIG_SMP
+	if (req->type == PM_QOS_REQ_AFFINE_IRQ &&
+	    irq_can_set_affinity(req->irq)) {
+		int ret = 0;
+
+		ret = irq_set_affinity_notifier(req->irq,
+						&req->irq_notify);
+		if (ret) {
+			WARN(1, "IRQ affinity notify set failed\n");
+			req->type = PM_QOS_REQ_ALL_CORES;
+			cpumask_setall(&req->cpus_affine);
+			pm_qos_update_target(
+				pm_qos_array[pm_qos_class]->constraints,
+				req, PM_QOS_UPDATE_REQ, value);
+		}
+	}
+#endif
 }
 EXPORT_SYMBOL_GPL(pm_qos_add_request);
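The qos.c hunk moves irq_set_affinity_notifier() to after pm_qos_update_target(), so the affinity notifier can only fire once the request is fully initialized and published; a registration failure now rolls the request back to all-cores and pushes an update so the fallback takes effect. A toy userspace sketch of that publish-then-register ordering follows; every toy_* name is invented for illustration, not part of the kernel API:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* toy stand-ins for the pm_qos types */
struct toy_request {
	bool all_cores;
	char cpus[8];                      /* affinity, printed as text */
	void (*notify)(struct toy_request *req);
};

static void toy_notify(struct toy_request *req)
{
	printf("notify fired: cpus=%s\n", req->cpus);
}

/* may fail, like irq_set_affinity_notifier(); 'fail' forces the error */
static int toy_register_notifier(struct toy_request *req, bool fail)
{
	if (fail)
		return -1;
	req->notify = toy_notify;
	return 0;
}

int main(void)
{
	struct toy_request req = { .all_cores = false };

	/* 1. fully initialize and publish the request first ... */
	strcpy(req.cpus, "0-3");
	printf("request added: cpus=%s\n", req.cpus);

	/* 2. ... only then hook up the callback that may reference it */
	if (toy_register_notifier(&req, true)) {
		/* mirror the error path: fall back and push an update */
		req.all_cores = true;
		strcpy(req.cpus, "all");
		printf("notifier failed, updated: cpus=%s all_cores=%d\n",
		       req.cpus, req.all_cores);
	}
	return 0;
}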
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5d101e8a5f2..d7846edd7a79 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5600,7 +5600,6 @@ int do_isolation_work_cpu_stop(void *data)
 	 */
 	nohz_balance_clear_nohz_mask(cpu);
 
-	clear_hmp_request(cpu);
 	local_irq_enable();
 	return 0;
 }
@@ -5682,7 +5681,7 @@ int sched_isolate_cpu(int cpu)
 	if (trace_sched_isolate_enabled())
 		start_time = sched_clock();
 
-	lock_device_hotplug();
+	cpu_maps_update_begin();
 
 	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
 
@@ -5725,13 +5724,14 @@ int sched_isolate_cpu(int cpu)
 	migrate_sync_cpu(cpu, cpumask_first(&avail_cpus));
 
 	stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+	clear_hmp_request(cpu);
 
 	calc_load_migrate(rq);
 	update_max_interval();
 	sched_update_group_capacities(cpu);
 
 out:
-	unlock_device_hotplug();
+	cpu_maps_update_done();
 	trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
 			    start_time, 1);
 	return ret_code;
@@ -5752,8 +5752,6 @@ int sched_unisolate_cpu_unlocked(int cpu)
 	if (trace_sched_isolate_enabled())
 		start_time = sched_clock();
 
-	lock_device_hotplug_assert();
-
 	if (!cpu_isolation_vote[cpu]) {
 		ret_code = -EINVAL;
 		goto out;
@@ -5792,9 +5790,9 @@ int sched_unisolate_cpu(int cpu)
 {
 	int ret_code;
 
-	lock_device_hotplug();
+	cpu_maps_update_begin();
 	ret_code = sched_unisolate_cpu_unlocked(cpu);
-	unlock_device_hotplug();
+	cpu_maps_update_done();
 	return ret_code;
 }
 
@@ -8073,6 +8071,9 @@ void __init sched_init(void)
 		atomic_set(&rq->nr_iowait, 0);
 	}
 
+	i = alloc_related_thread_groups();
+	BUG_ON(i);
+
 	set_hmp_defaults();
 
 	set_load_weight(&init_task);
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index 9b21a09ec4ba..aac12bfc2ae6 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -893,14 +893,10 @@ static int __ref cpu_callback(struct notifier_block *nfb,
 	unsigned int need;
 	int ret = NOTIFY_OK;
 
-	/* Don't affect suspend resume */
-	if (action & CPU_TASKS_FROZEN)
-		return NOTIFY_OK;
-
 	if (unlikely(!cluster || !cluster->inited))
 		return NOTIFY_OK;
 
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
 
 		/* If online state of CPU somehow got out of sync, fix it. */
@@ -1095,7 +1091,7 @@ static int __init core_ctl_init(void)
 	cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
 	cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
 
-	lock_device_hotplug();
+	cpu_maps_update_begin();
 	for_each_online_cpu(cpu) {
 		struct cpufreq_policy *policy;
 		int ret;
@@ -1109,7 +1105,7 @@ static int __init core_ctl_init(void)
 			cpufreq_cpu_put(policy);
 		}
 	}
-	unlock_device_hotplug();
+	cpu_maps_update_done();
 	initialized = true;
 	return 0;
 }
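In core_ctl.c, masking CPU_TASKS_FROZEN in the switch means hotplug events generated during suspend/resume now take the same handler cases as normal ones instead of being ignored outright. A small standalone sketch of the masking idiom follows; the TOY_* constants are stand-ins for this example, not the values from linux/cpu.h:

#include <stdio.h>

#define TOY_CPU_ONLINE      0x02
#define TOY_CPU_UP_PREPARE  0x03
#define TOY_TASKS_FROZEN    0x10   /* OR'ed in during suspend/resume */

static void handle(unsigned long action)
{
	/* strip the FROZEN bit so frozen and normal events share cases */
	switch (action & ~TOY_TASKS_FROZEN) {
	case TOY_CPU_UP_PREPARE:
		printf("up-prepare (frozen=%lu)\n",
		       action & TOY_TASKS_FROZEN ? 1UL : 0UL);
		break;
	case TOY_CPU_ONLINE:
		printf("online (frozen=%lu)\n",
		       action & TOY_TASKS_FROZEN ? 1UL : 0UL);
		break;
	}
}

int main(void)
{
	handle(TOY_CPU_ONLINE);                      /* normal hotplug */
	handle(TOY_CPU_ONLINE | TOY_TASKS_FROZEN);   /* resume path */
	return 0;
}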
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 968a41e0e81e..6304c5030137 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -641,14 +641,18 @@ void clear_hmp_request(int cpu)
 	clear_boost_kick(cpu);
 	clear_reserved(cpu);
 	if (rq->push_task) {
+		struct task_struct *push_task = NULL;
+
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->push_task) {
 			clear_reserved(rq->push_cpu);
-			put_task_struct(rq->push_task);
+			push_task = rq->push_task;
 			rq->push_task = NULL;
 		}
 		rq->active_balance = 0;
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
+		if (push_task)
+			put_task_struct(push_task);
 	}
 }
 
@@ -784,11 +788,12 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000;
 
 static unsigned int sync_cpu;
 
-static LIST_HEAD(related_thread_groups);
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
 static DEFINE_RWLOCK(related_thread_group_lock);
 
 #define for_each_related_thread_group(grp) \
-	list_for_each_entry(grp, &related_thread_groups, list)
+	list_for_each_entry(grp, &active_related_thread_groups, list)
 
 /*
  * Task load is categorized into buckets for the purpose of top task tracking.
@@ -1767,20 +1772,20 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups)
 		if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
 			return 0;
 	} else {
-		read_lock(&related_thread_group_lock);
+		read_lock_irqsave(&related_thread_group_lock, flags);
 		/*
 		 * Protect from concurrent update of rq->prev_runnable_sum and
 		 * group cpu load
 		 */
-		raw_spin_lock_irqsave(&rq->lock, flags);
+		raw_spin_lock(&rq->lock);
 		if (check_groups)
 			_group_load_in_cpu(cpu_of(rq), &group_load, NULL);
 
 		new_load = rq->prev_runnable_sum + group_load;
 		new_load = freq_policy_load(rq, new_load);
 
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-		read_unlock(&related_thread_group_lock);
+		raw_spin_unlock(&rq->lock);
+		read_unlock_irqrestore(&related_thread_group_lock, flags);
 
 		cur_freq = load_to_freq(rq, rq->old_busy_time);
 		freq_required = load_to_freq(rq, new_load);
@@ -3052,7 +3057,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 
 	read_unlock(&tasklist_lock);
 
-	list_for_each_entry(grp, &related_thread_groups, list) {
+	list_for_each_entry(grp, &active_related_thread_groups, list) {
 		int j;
 
 		for_each_possible_cpu(j) {
@@ -3202,14 +3207,16 @@ void sched_get_cpus_busy(struct sched_load *busy,
 	if (unlikely(cpus == 0))
 		return;
 
+	local_irq_save(flags);
+
+	read_lock(&related_thread_group_lock);
+
 	/*
 	 * This function could be called in timer context, and the
 	 * current task may have been executing for a long time. Ensure
 	 * that the window stats are current by doing an update.
 	 */
-	read_lock(&related_thread_group_lock);
 
-	local_irq_save(flags);
 	for_each_cpu(cpu, query_cpus)
 		raw_spin_lock(&cpu_rq(cpu)->lock);
 
@@ -3309,10 +3316,11 @@ skip_early:
 
 	for_each_cpu(cpu, query_cpus)
 		raw_spin_unlock(&(cpu_rq(cpu))->lock);
-	local_irq_restore(flags);
 
 	read_unlock(&related_thread_group_lock);
 
+	local_irq_restore(flags);
+
 	i = 0;
 	for_each_cpu(cpu, query_cpus) {
 		rq = cpu_rq(cpu);
@@ -3965,47 +3973,54 @@ _group_cpu_time(struct related_thread_group *grp, int cpu)
 	return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
 }
 
-struct related_thread_group *alloc_related_thread_group(int group_id)
+static inline struct related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
 {
-	struct related_thread_group *grp;
-
-	grp = kzalloc(sizeof(*grp), GFP_ATOMIC);
-	if (!grp)
-		return ERR_PTR(-ENOMEM);
-
-	if (alloc_group_cputime(grp)) {
-		kfree(grp);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	grp->id = group_id;
-	INIT_LIST_HEAD(&grp->tasks);
-	INIT_LIST_HEAD(&grp->list);
-	raw_spin_lock_init(&grp->lock);
-
-	return grp;
+	return related_thread_groups[group_id];
 }
 
-struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
+int alloc_related_thread_groups(void)
 {
+	int i, ret;
 	struct related_thread_group *grp;
 
-	list_for_each_entry(grp, &related_thread_groups, list) {
-		if (grp->id == group_id)
-			return grp;
+	/* group_id = 0 is invalid as it's the special id used to remove a group. */
+	for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+		grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+		if (!grp) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		if (alloc_group_cputime(grp)) {
+			kfree(grp);
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		grp->id = i;
+		INIT_LIST_HEAD(&grp->tasks);
+		INIT_LIST_HEAD(&grp->list);
+		raw_spin_lock_init(&grp->lock);
+
+		related_thread_groups[i] = grp;
 	}
 
-	return NULL;
-}
+	return 0;
 
-/* See comments before preferred_cluster() */
-static void free_related_thread_group(struct rcu_head *rcu)
-{
-	struct related_thread_group *grp = container_of(rcu, struct
-			related_thread_group, rcu);
+err:
+	for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+		grp = lookup_related_thread_group(i);
+		if (grp) {
+			free_group_cputime(grp);
+			kfree(grp);
+			related_thread_groups[i] = NULL;
+		} else {
+			break;
+		}
+	}
 
-	free_group_cputime(grp);
-	kfree(grp);
+	return ret;
 }
 
 static void remove_task_from_group(struct task_struct *p)
@@ -4030,10 +4045,12 @@ static void remove_task_from_group(struct task_struct *p)
 	raw_spin_unlock(&grp->lock);
 
 	/* Reserved groups cannot be destroyed */
-	if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) {
-		list_del(&grp->list);
-		call_rcu(&grp->rcu, free_related_thread_group);
-	}
+	if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+		/*
+		 * We test whether grp->list is attached with list_empty()
+		 * hence re-init the list after deletion.
+		 */
+		list_del_init(&grp->list);
 }
 
 static int
@@ -4105,53 +4122,15 @@ void add_new_task_to_grp(struct task_struct *new)
 	write_unlock_irqrestore(&related_thread_group_lock, flags);
 }
 
-#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
-/*
- * We create a default colocation group at boot. There is no need to
- * synchronize tasks between cgroups at creation time because the
- * correct cgroup hierarchy is not available at boot. Therefore cgroup
- * colocation is turned off by default even though the colocation group
- * itself has been allocated. Furthermore this colocation group cannot
- * be destroyted once it has been created. All of this has been as part
- * of runtime optimizations.
- *
- * The job of synchronizing tasks to the colocation group is done when
- * the colocation flag in the cgroup is turned on.
- */
-static int __init create_default_coloc_group(void)
-{
-	struct related_thread_group *grp = NULL;
-	unsigned long flags;
-
-	grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
-	if (IS_ERR(grp)) {
-		WARN_ON(1);
-		return -ENOMEM;
-	}
-
-	write_lock_irqsave(&related_thread_group_lock, flags);
-	list_add(&grp->list, &related_thread_groups);
-	write_unlock_irqrestore(&related_thread_group_lock, flags);
-
-	update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
-	return 0;
-}
-late_initcall(create_default_coloc_group);
-
-int sync_cgroup_colocation(struct task_struct *p, bool insert)
-{
-	unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
-
-	return sched_set_group_id(p, grp_id);
-}
-#endif
-
-int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
 {
 	int rc = 0;
 	unsigned long flags;
 	struct related_thread_group *grp = NULL;
 
+	if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+		return -EINVAL;
+
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	write_lock(&related_thread_group_lock);
 
@@ -4167,29 +4146,26 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
 	}
 
 	grp = lookup_related_thread_group(group_id);
-	if (!grp) {
-		/* This is a reserved id */
-		if (group_id == DEFAULT_CGROUP_COLOC_ID) {
-			rc = -EINVAL;
-			goto done;
-		}
-
-		grp = alloc_related_thread_group(group_id);
-		if (IS_ERR(grp)) {
-			rc = -ENOMEM;
-			goto done;
-		}
-
-		list_add(&grp->list, &related_thread_groups);
-	}
+	if (list_empty(&grp->list))
+		list_add(&grp->list, &active_related_thread_groups);
 
 	rc = add_task_to_group(p, grp);
 done:
 	write_unlock(&related_thread_group_lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
 	return rc;
 }
 
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+	/* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+	if (group_id == DEFAULT_CGROUP_COLOC_ID)
+		return -EINVAL;
+
+	return __sched_set_group_id(p, group_id);
+}
+
 unsigned int sched_get_group_id(struct task_struct *p)
 {
 	unsigned int group_id;
@@ -4203,6 +4179,42 @@ unsigned int sched_get_group_id(struct task_struct *p)
 	return group_id;
 }
 
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore this colocation group cannot
+ * be destroyed once it has been created. All of this has been done as
+ * part of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+	struct related_thread_group *grp = NULL;
+	unsigned long flags;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+	write_lock_irqsave(&related_thread_group_lock, flags);
+	list_add(&grp->list, &active_related_thread_groups);
+	write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+	update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+	return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+	unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+	return __sched_set_group_id(p, grp_id);
+}
+#endif
+
 static void update_cpu_cluster_capacity(const cpumask_t *cpus)
 {
 	int i;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 30838bb9b442..f569c6fe3cbb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1448,6 +1448,8 @@ static inline void update_cgroup_boost_settings(void) { }
 static inline void restore_cgroup_boost_settings(void) { }
 #endif
 
+extern int alloc_related_thread_groups(void);
+
 #else /* CONFIG_SCHED_HMP */
 
 struct hmp_sched_stats;
@@ -1638,6 +1640,7 @@
 static inline void set_hmp_defaults(void) { }
 static inline void clear_reserved(int cpu) { }
 static inline void sched_boost_parse_dt(void) {}
+static inline int alloc_related_thread_groups(void) { return 0; }
 
 #define trace_sched_cpu_load(...)
 #define trace_sched_cpu_load_lb(...)
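The clear_hmp_request() change in hmp.c detaches rq->push_task while holding the runqueue lock but drops the task reference only after the lock is released, keeping put_task_struct() (which can do real teardown work) out of the critical section. A minimal pthread sketch of the same detach-then-release pattern follows; all names are illustrative, not the kernel's:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* toy refcounted task standing in for struct task_struct */
struct toy_task {
	int refcount;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_task *push_task;

static void toy_put(struct toy_task *t)
{
	/* may do arbitrary teardown work; keep it outside the lock */
	if (--t->refcount == 0)
		free(t);
}

static void clear_request(void)
{
	struct toy_task *detached = NULL;

	pthread_mutex_lock(&lock);
	if (push_task) {
		detached = push_task;    /* detach under the lock ... */
		push_task = NULL;
	}
	pthread_mutex_unlock(&lock);

	if (detached)
		toy_put(detached);       /* ... drop the reference outside */
}

int main(void)
{
	push_task = calloc(1, sizeof(*push_task));
	push_task->refcount = 1;
	clear_request();
	printf("request cleared\n");
	return 0;
}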
