Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/core.c                  74
-rw-r--r--  kernel/bpf/inode.c                  2
-rw-r--r--  kernel/bpf/verifier.c               2
-rw-r--r--  kernel/cgroup.c                    57
-rw-r--r--  kernel/compat.c                     2
-rw-r--r--  kernel/sched/core.c                 7
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  122
-rw-r--r--  kernel/sched/deadline.c             2
-rw-r--r--  kernel/sched/energy.c              13
-rw-r--r--  kernel/sched/fair.c               782
-rw-r--r--  kernel/sched/features.h            26
-rw-r--r--  kernel/sched/rt.c                 331
-rw-r--r--  kernel/sched/sched.h               37
-rw-r--r--  kernel/sched/tune.c               112
-rw-r--r--  kernel/sched/walt.c                 2
-rw-r--r--  kernel/softirq.c                   30
-rw-r--r--  kernel/trace/blktrace.c             3
-rw-r--r--  kernel/trace/trace.c               45
-rw-r--r--  kernel/trace/trace_irqsoff.c        3
-rw-r--r--  kernel/trace/trace_sched_wakeup.c   2
20 files changed, 1057 insertions, 597 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2b1a925489cf..d5c9dcfe9324 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -373,9 +373,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JNE | BPF_K:
case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
case BPF_JMP | BPF_JSET | BPF_K:
/* Accommodate for extra offset in case of a backjump. */
off = from->off;
@@ -604,7 +608,7 @@ static unsigned int __bpf_prog_run(const struct sk_buff *ctx, const struct bpf_i
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
- [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -613,12 +617,20 @@ static unsigned int __bpf_prog_run(const struct sk_buff *ctx, const struct bpf_i
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
+ [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
+ [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
+ [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
+ [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
/* Program return */
@@ -856,6 +868,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLT_X:
+ if (DST < SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLT_K:
+ if (DST < IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JGE_X:
if (DST >= SRC) {
insn += insn->off;
@@ -868,6 +892,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLE_X:
+ if (DST <= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLE_K:
+ if (DST <= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGT_X:
if (((s64) DST) > ((s64) SRC)) {
insn += insn->off;
@@ -880,6 +916,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLT_X:
+ if (((s64) DST) < ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLT_K:
+ if (((s64) DST) < ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGE_X:
if (((s64) DST) >= ((s64) SRC)) {
insn += insn->off;
@@ -892,6 +940,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLE_X:
+ if (((s64) DST) <= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLE_K:
+ if (((s64) DST) <= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSET_X:
if (DST & SRC) {
insn += insn->off;
@@ -1283,12 +1343,22 @@ const struct bpf_func_proto bpf_tail_call_proto = {
.arg3_type = ARG_ANYTHING,
};
-/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
+ * It is encouraged to implement bpf_int_jit_compile() instead, so that
+ * eBPF and implicitly also cBPF can get JITed!
+ */
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
return prog;
}
+/* Stub for JITs that support eBPF. All cBPF code gets transformed into
+ * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
+ */
+void __weak bpf_jit_compile(struct bpf_prog *prog)
+{
+}
+
bool __weak bpf_helper_changes_skb_data(void *func)
{
return false;
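
For reference, a minimal sketch (not part of the patch) of how the new unsigned and signed "lower-than" jumps added above could be used; it assumes the BPF_MOV64_IMM()/BPF_JMP_IMM()/BPF_EXIT_INSN() insn-building macros from include/linux/filter.h. Before BPF_JLT existed, the same test had to be expressed by swapping the operands of a BPF_JGT jump.

/* Hypothetical snippet: r0 = (r1 < 42) ? 1 : 0 */
struct bpf_insn prog_jlt[] = {
	BPF_MOV64_IMM(BPF_REG_0, 1),		/* r0 = 1                 */
	BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 42, 1),	/* if (r1 < 42) skip next */
	BPF_MOV64_IMM(BPF_REG_0, 0),		/* r0 = 0                 */
	BPF_EXIT_INSN(),			/* return r0              */
};
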
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a52c25ba18a..62c4ab07a7ac 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -189,7 +189,7 @@ static const struct inode_operations bpf_dir_iops = {
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.rmdir = simple_rmdir,
- .rename = simple_rename,
+ .rename2 = simple_rename,
.link = simple_link,
.unlink = simple_unlink,
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 78bdfbefd996..0bdb7c1b558d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3496,7 +3496,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* that doesn't support bpf_tail_call yet
*/
insn->imm = 0;
- insn->code |= BPF_X;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
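
The verifier hunk above pairs with the interpreter change in core.c: tail calls are now rewritten to a dedicated BPF_JMP | BPF_TAIL_CALL pseudo-instruction instead of overloading BPF_JMP | BPF_CALL | BPF_X. A hedged sketch of how a back end could then test for it (the helper name below is hypothetical):

/* Illustration only: with the rewrite above, JITs and the interpreter can
 * match the pseudo-opcode directly rather than decoding BPF_CALL | BPF_X. */
static bool insn_is_tail_call(const struct bpf_insn *insn)
{
	return insn->code == (BPF_JMP | BPF_TAIL_CALL);
}
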
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c2508ca442b7..64a3b3db2484 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2848,44 +2848,6 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
return ret;
}
-int subsys_cgroup_allow_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset)
-{
- const struct cred *cred = current_cred(), *tcred;
- struct task_struct *task;
-
- if (capable(CAP_SYS_NICE))
- return 0;
-
- cgroup_taskset_for_each(task, css, tset) {
- tcred = __task_cred(task);
-
- if (current != task && !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
- return -EACCES;
- }
-
- return 0;
-}
-
-static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-{
- struct cgroup_subsys_state *css;
- int i;
- int ret;
-
- for_each_css(css, i, cgrp) {
- if (css->ss->allow_attach) {
- ret = css->ss->allow_attach(css, tset);
- if (ret)
- return ret;
- } else {
- return -EACCES;
- }
- }
-
- return 0;
-}
-
static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *dst_cgrp,
struct kernfs_open_file *of)
@@ -2901,23 +2863,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
!uid_eq(cred->euid, tcred->suid) &&
- !ns_capable(tcred->user_ns, CAP_SYS_NICE)) {
- /*
- * if the default permission check fails, give each
- * cgroup a chance to extend the permission check
- */
- struct cgroup_taskset tset = {
- .src_csets = LIST_HEAD_INIT(tset.src_csets),
- .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
- .csets = &tset.src_csets,
- };
- struct css_set *cset;
- cset = task_css_set(task);
- list_add(&cset->mg_node, &tset.src_csets);
- ret = cgroup_allow_attach(dst_cgrp, &tset);
- if (ret)
- ret = -EACCES;
- }
+ !ns_capable(tcred->user_ns, CAP_SYS_NICE))
+ ret = -EACCES;
if (!ret && cgroup_on_dfl(dst_cgrp)) {
struct super_block *sb = of->file->f_path.dentry->d_sb;
diff --git a/kernel/compat.c b/kernel/compat.c
index 333d364be29d..fd31231bab25 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -644,7 +644,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a8d2c50737ee..f17cfe5a313f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1170,13 +1170,10 @@ struct migration_arg {
*/
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
- int src_cpu;
-
/* Affinity changed (again). */
if (!is_cpu_allowed(p, dest_cpu))
return rq;
- src_cpu = cpu_of(rq);
rq = move_queued_task(rq, p, dest_cpu);
return rq;
@@ -5064,14 +5061,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
size_t retlen = min_t(size_t, len, cpumask_size());
- if (copy_to_user(user_mask_ptr, mask, retlen))
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
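
The zalloc_cpumask_var()/cpumask_bits() changes in compat.c and core.c above are easier to see with a worked example; the configuration numbers below are assumptions for illustration, not taken from the patch.

/*
 * Assume NR_CPUS = 8192 but nr_cpu_ids = 8 on the running system:
 *	cpumask_size() = BITS_TO_LONGS(8192) * sizeof(long) = 1024 bytes,
 * while sched_getaffinity() only writes the first nr_cpu_ids bits.
 * retlen = min_t(size_t, len, cpumask_size()) may therefore copy up to
 * ~1016 bytes that alloc_cpumask_var() left uninitialized; switching to
 * zalloc_cpumask_var() guarantees user space reads zeros there instead of
 * stale kernel memory, and cpumask_bits(mask) hands copy_to_user() the
 * underlying unsigned long array explicitly.
 */
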
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 6effb44aeb30..869a125ebb87 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -82,6 +82,7 @@ struct sugov_cpu {
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+static DEFINE_PER_CPU(struct sugov_tunables *, cached_tunables);
/************************ Governor internals ***********************/
@@ -89,16 +90,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
- if (sg_policy->work_in_progress)
- return false;
-
if (unlikely(sg_policy->need_freq_update)) {
- sg_policy->need_freq_update = false;
- /*
- * This happens when limits change, so forget the previous
- * next_freq value and force an update.
- */
- sg_policy->next_freq = UINT_MAX;
return true;
}
@@ -150,7 +142,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
policy->cur = next_freq;
trace_cpu_frequency(next_freq, smp_processor_id());
- } else {
+ } else if (!sg_policy->work_in_progress) {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -187,8 +179,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
+
+ sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -234,6 +228,15 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
if (!sg_policy->tunables->iowait_boost_enable)
return;
+ if (sg_cpu->iowait_boost) {
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Clear iowait_boost if the CPU appears to have been idle. */
+ if (delta_ns > TICK_NSEC) {
+ sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
+ }
if (flags & SCHED_CPUFREQ_IOWAIT) {
if (sg_cpu->iowait_boost_pending)
return;
@@ -247,14 +250,6 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
} else {
sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
}
- } else if (sg_cpu->iowait_boost) {
- s64 delta_ns = time - sg_cpu->last_update;
-
- /* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC) {
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_pending = false;
- }
}
}
@@ -311,6 +306,13 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
+ /*
+ * For slow-switch systems, single policy requests can't run at the
+ * moment if update is in progress, unless we acquire update_lock.
+ */
+ if (sg_policy->work_in_progress)
+ return;
+
if (!sugov_should_update_freq(sg_policy, time))
return;
@@ -326,7 +328,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq) {
+ if (busy && next_f < sg_policy->next_freq &&
+ sg_policy->next_freq != UINT_MAX) {
next_f = sg_policy->next_freq;
/* Reset cached freq as next_freq has changed */
@@ -366,7 +369,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
- if (j_util * max > j_max * util) {
+ if (j_util * max >= j_max * util) {
util = j_util;
max = j_max;
}
@@ -411,13 +414,27 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+ unsigned int freq;
+ unsigned long flags;
+
+ /*
+ * Hold sg_policy->update_lock briefly to handle the case where:
+ * if sg_policy->next_freq is read here, and then updated by
+ * sugov_update_shared() just before work_in_progress is set to false
+ * here, we may miss queueing the new update.
+ *
+ * Note: If a work item was queued after the update_lock is released,
+ * sugov_work() will just be called again by the kthread_work code;
+ * the request will then be processed before the sugov thread sleeps.
+ */
+ raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
+ freq = sg_policy->next_freq;
+ sg_policy->work_in_progress = false;
+ raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
mutex_lock(&sg_policy->work_lock);
- __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
- CPUFREQ_RELATION_L);
+ __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
mutex_unlock(&sg_policy->work_lock);
-
- sg_policy->work_in_progress = false;
}
static void sugov_irq_work(struct irq_work *irq_work)
@@ -640,6 +657,29 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic
return tunables;
}
+static void sugov_tunables_save(struct cpufreq_policy *policy,
+ struct sugov_tunables *tunables)
+{
+ int cpu;
+ struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu);
+
+ if (!have_governor_per_policy())
+ return;
+
+ if (!cached) {
+ cached = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (!cached) {
+ pr_warn("Couldn't allocate tunables for caching\n");
+ return;
+ }
+ for_each_cpu(cpu, policy->related_cpus)
+ per_cpu(cached_tunables, cpu) = cached;
+ }
+
+ cached->up_rate_limit_us = tunables->up_rate_limit_us;
+ cached->down_rate_limit_us = tunables->down_rate_limit_us;
+}
+
static void sugov_tunables_free(struct sugov_tunables *tunables)
{
if (!have_governor_per_policy())
@@ -648,6 +688,25 @@ static void sugov_tunables_free(struct sugov_tunables *tunables)
kfree(tunables);
}
+static void sugov_tunables_restore(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ struct sugov_tunables *tunables = sg_policy->tunables;
+ struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu);
+
+ if (!cached)
+ return;
+
+ tunables->up_rate_limit_us = cached->up_rate_limit_us;
+ tunables->down_rate_limit_us = cached->down_rate_limit_us;
+ sg_policy->up_rate_delay_ns =
+ tunables->up_rate_limit_us * NSEC_PER_USEC;
+ sg_policy->down_rate_delay_ns =
+ tunables->down_rate_limit_us * NSEC_PER_USEC;
+ sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
+ sg_policy->down_rate_delay_ns);
+}
+
static int sugov_init(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy;
@@ -710,6 +769,8 @@ static int sugov_init(struct cpufreq_policy *policy)
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
+ sugov_tunables_restore(policy);
+
ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
get_governor_parent_kobj(policy), "%s",
cpufreq_gov_schedutil.name);
@@ -749,8 +810,10 @@ static int sugov_exit(struct cpufreq_policy *policy)
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
- if (!count)
+ if (!count) {
+ sugov_tunables_save(policy, tunables);
sugov_tunables_free(tunables);
+ }
mutex_unlock(&global_tunables_lock);
@@ -772,7 +835,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
update_min_rate_limit_us(sg_policy);
sg_policy->last_freq_update_time = 0;
- sg_policy->next_freq = UINT_MAX;
+ sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
@@ -784,6 +847,11 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_DL;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ }
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
sugov_update_shared :
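
For context on the cached_raw_freq/need_freq_update interplay above, a small sketch of the heuristic applied by get_next_freq(); the base frequency is assumed to be the policy's maximum (or current) frequency, as in mainline schedutil, and the numbers are illustrative only.

/* Illustration only: next = 1.25 * base * util / max.
 * E.g. base = 2000000 kHz, util = 512, max = 1024:
 *	(2000000 + 2000000 / 4) * 512 / 1024 = 1250000 kHz
 * The raw value is resolved to a supported OPP by
 * cpufreq_driver_resolve_freq(); cached_raw_freq lets an unchanged raw
 * value short-circuit to the previously resolved next_freq unless
 * need_freq_update forces a refresh. */
static unsigned int sugov_next_freq_sketch(unsigned int base_khz,
					   unsigned long util,
					   unsigned long max)
{
	return (base_khz + (base_khz >> 2)) * util / max;
}
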
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 188c8388a63f..d40995e9cf5f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1038,6 +1038,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
+ walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_dl_deadline(dl_rq, deadline);
@@ -1053,6 +1054,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+ walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_dl_deadline(dl_rq, dl_se->deadline);
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
index 50d183b1e156..770624996f9f 100644
--- a/kernel/sched/energy.c
+++ b/kernel/sched/energy.c
@@ -91,11 +91,17 @@ void init_sched_energy_costs(void)
sge = kcalloc(1, sizeof(struct sched_group_energy),
GFP_NOWAIT);
+ if (!sge)
+ goto out;
nstates = (prop->length / sizeof(u32)) / 2;
cap_states = kcalloc(nstates,
sizeof(struct capacity_state),
GFP_NOWAIT);
+ if (!cap_states) {
+ kfree(sge);
+ goto out;
+ }
for (i = 0, val = prop->value; i < nstates; i++) {
cap_states[i].cap = be32_to_cpup(val++);
@@ -108,6 +114,8 @@ void init_sched_energy_costs(void)
prop = of_find_property(cp, "idle-cost-data", NULL);
if (!prop || !prop->value) {
pr_warn("No idle-cost data, skipping sched_energy init\n");
+ kfree(sge);
+ kfree(cap_states);
goto out;
}
@@ -115,6 +123,11 @@ void init_sched_energy_costs(void)
idle_states = kcalloc(nstates,
sizeof(struct idle_state),
GFP_NOWAIT);
+ if (!idle_states) {
+ kfree(sge);
+ kfree(cap_states);
+ goto out;
+ }
for (i = 0, val = prop->value; i < nstates; i++)
idle_states[i].power = be32_to_cpup(val++);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 266fc95f6c0f..43c3d2684f64 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3690,68 +3690,6 @@ static inline int migration_needed(struct task_struct *p, int cpu)
return 0;
}
-static inline int
-kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
-{
- unsigned long flags;
- int rc = 0;
-
- /* Invoke active balance to force migrate currently running task */
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (!rq->active_balance) {
- rq->active_balance = 1;
- rq->push_cpu = new_cpu;
- get_task_struct(p);
- rq->push_task = p;
- rc = 1;
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-
- return rc;
-}
-
-static DEFINE_RAW_SPINLOCK(migration_lock);
-
-static bool do_migration(int reason, int new_cpu, int cpu)
-{
- if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
- && same_cluster(new_cpu, cpu))
- return false;
-
- /* Inter cluster high irqload migrations are OK */
- return new_cpu != cpu;
-}
-
-/*
- * Check if currently running task should be migrated to a better cpu.
- *
- * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
- */
-void check_for_migration(struct rq *rq, struct task_struct *p)
-{
- int cpu = cpu_of(rq), new_cpu;
- int active_balance = 0, reason;
-
- reason = migration_needed(p, cpu);
- if (!reason)
- return;
-
- raw_spin_lock(&migration_lock);
- new_cpu = select_best_cpu(p, cpu, reason, 0);
-
- if (do_migration(reason, new_cpu, cpu)) {
- active_balance = kick_active_balance(rq, p, new_cpu);
- if (active_balance)
- mark_reserved(new_cpu);
- }
-
- raw_spin_unlock(&migration_lock);
-
- if (active_balance)
- stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
- &rq->active_balance_work);
-}
-
#ifdef CONFIG_CFS_BANDWIDTH
static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
@@ -5060,12 +4998,9 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- u64 now;
-
if (cfs_b->quota == RUNTIME_INF)
return;
- now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
}
@@ -6062,6 +5997,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
dec_rq_hmp_stats(rq, p, 1);
}
+#ifdef CONFIG_SMP
+ if (energy_aware() && !se)
+ walt_dec_cumulative_runnable_avg(rq, p);
+#endif /* CONFIG_SMP */
+
hrtick_update(rq);
}
@@ -6472,28 +6412,79 @@ unsigned long capacity_curr_of(int cpu)
>> SCHED_CAPACITY_SHIFT;
}
+/*
+ * CPU candidates.
+ *
+ * These are labels to reference CPU candidates for an energy_diff.
+ * Currently we support only two possible candidates: the task's previous CPU
+ * and another candidate CPU.
+ * More advanced/aggressive EAS selection policies can consider more
+ * candidates.
+ */
+#define EAS_CPU_PRV 0
+#define EAS_CPU_NXT 1
+#define EAS_CPU_BKP 2
+#define EAS_CPU_CNT 3
+
+/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and min freq scaling.
+ */
+unsigned long capacity_min_of(int cpu)
+{
+ if (!sched_feat(MIN_CAPACITY_CAPPING))
+ return 0;
+ return arch_scale_cpu_capacity(NULL, cpu) *
+ arch_scale_min_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
+
+/*
+ * energy_diff - supports the computation of the estimated energy impact in
+ * moving a "task"'s "util_delta" between different CPU candidates.
+ */
struct energy_env {
- struct sched_group *sg_top;
- struct sched_group *sg_cap;
- int cap_idx;
+ /* Utilization to move */
+ struct task_struct *p;
int util_delta;
- int src_cpu;
- int dst_cpu;
- int trg_cpu;
- int energy;
- int payoff;
- struct task_struct *task;
- struct {
- int before;
- int after;
- int delta;
- int diff;
- } nrg;
+
+ /* Mask of CPUs candidates to evaluate */
+ cpumask_t cpus_mask;
+
+ /* CPU candidates to evaluate */
struct {
- int before;
- int after;
- int delta;
- } cap;
+
+ /* CPU ID, must be in cpus_mask */
+ int cpu_id;
+
+ /*
+ * Index (into sched_group_energy::cap_states) of the OPP the
+ * CPU needs to run at if the task is placed on it.
+ * This includes both the active and blocked load, due to
+ * other tasks on this CPU, as well as the task's own
+ * utilization.
+ */
+ int cap_idx;
+ int cap;
+
+ /* Estimated system energy */
+ unsigned int energy;
+
+ /* Estimated energy variation wrt EAS_CPU_PRV */
+ int nrg_delta;
+
+ } cpu[EAS_CPU_CNT];
+
+ /*
+ * Index (into energy_env::cpu) of the most energy efficient CPU for
+ * the specified energy_env::task
+ */
+ int next_idx;
+
+ /* Support data */
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ struct sched_group *sg;
};
static int cpu_util_wake(int cpu, struct task_struct *p);
@@ -6521,24 +6512,33 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
return (util << SCHED_CAPACITY_SHIFT)/capacity;
}
-static unsigned long group_max_util(struct energy_env *eenv)
+static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
{
unsigned long max_util = 0;
unsigned long util;
int cpu;
for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
- util = cpu_util_wake(cpu, eenv->task);
+ util = cpu_util_wake(cpu, eenv->p);
/*
* If we are looking at the target CPU specified by the eenv,
* then we should add the (estimated) utilization of the task
* assuming we will wake it up on that CPU.
*/
- if (unlikely(cpu == eenv->trg_cpu))
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
util += eenv->util_delta;
max_util = max(max_util, util);
+
+ /*
+ * Take into account any minimum frequency imposed
+ * elsewhere which limits the energy states available
+ * If the MIN_CAPACITY_CAPPING feature is not enabled
+ * capacity_min_of will return 0 (not capped).
+ */
+ max_util = max(max_util, capacity_min_of(cpu));
+
}
return max_util;
@@ -6556,21 +6556,21 @@ static unsigned long group_max_util(struct energy_env *eenv)
* estimate (more busy).
*/
static unsigned
-long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+long group_norm_util(struct energy_env *eenv, int cpu_idx)
{
- unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+ unsigned long capacity = eenv->cpu[cpu_idx].cap;
unsigned long util, util_sum = 0;
int cpu;
- for_each_cpu(cpu, sched_group_cpus(sg)) {
- util = cpu_util_wake(cpu, eenv->task);
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
+ util = cpu_util_wake(cpu, eenv->p);
/*
* If we are looking at the target CPU specified by the eenv,
* then we should add the (estimated) utilization of the task
* assuming we will wake it up on that CPU.
*/
- if (unlikely(cpu == eenv->trg_cpu))
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
util += eenv->util_delta;
util_sum += __cpu_norm_util(util, capacity);
@@ -6579,27 +6579,31 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
}
-static int find_new_capacity(struct energy_env *eenv,
- const struct sched_group_energy * const sge)
+static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
{
+ const struct sched_group_energy *sge = eenv->sg->sge;
int idx, max_idx = sge->nr_cap_states - 1;
- unsigned long util = group_max_util(eenv);
+ unsigned long util = group_max_util(eenv, cpu_idx);
/* default is max_cap if we don't find a match */
- eenv->cap_idx = max_idx;
+ eenv->cpu[cpu_idx].cap_idx = max_idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
for (idx = 0; idx < sge->nr_cap_states; idx++) {
if (sge->cap_states[idx].cap >= util) {
- eenv->cap_idx = idx;
+ /* Keep track of SG's capacity */
+ eenv->cpu[cpu_idx].cap_idx = idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
break;
}
}
- return eenv->cap_idx;
+ return eenv->cpu[cpu_idx].cap_idx;
}
-static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
+static int group_idle_state(struct energy_env *eenv, int cpu_idx)
{
+ struct sched_group *sg = eenv->sg;
int i, state = INT_MAX;
int src_in_grp, dst_in_grp;
long grp_util = 0;
@@ -6611,8 +6615,10 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
state++;
- src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
- dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
+ sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
+ sched_group_cpus(sg));
if (src_in_grp == dst_in_grp) {
/* both CPUs under consideration are in the same group or not in
* either group, migration should leave idle state the same.
@@ -6625,8 +6631,8 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
* achievable when we move the task.
*/
for_each_cpu(i, sched_group_cpus(sg)) {
- grp_util += cpu_util_wake(i, eenv->task);
- if (unlikely(i == eenv->trg_cpu))
+ grp_util += cpu_util_wake(i, eenv->p);
+ if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
grp_util += eenv->util_delta;
}
@@ -6662,19 +6668,65 @@ end:
}
/*
- * sched_group_energy(): Computes the absolute energy consumption of cpus
- * belonging to the sched_group including shared resources shared only by
- * members of the group. Iterates over all cpus in the hierarchy below the
- * sched_group starting from the bottom working it's way up before going to
- * the next cpu until all cpus are covered at all levels. The current
- * implementation is likely to gather the same util statistics multiple times.
- * This can probably be done in a faster but more complex way.
- * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
+ *
+ * This works in iterations to compute the SG's energy for each CPU
+ * candidate defined by the energy_env's cpu array.
+ *
+ * NOTE: in the following computations for busy_energy and idle_energy we do
+ * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
+ * The required scaling will be performed just one time, by the calling
+ * functions, once we have accumulated the contributions for all the SGs.
*/
-static int sched_group_energy(struct energy_env *eenv)
+static void calc_sg_energy(struct energy_env *eenv)
+{
+ struct sched_group *sg = eenv->sg;
+ int busy_energy, idle_energy;
+ unsigned int busy_power;
+ unsigned int idle_power;
+ unsigned long sg_util;
+ int cap_idx, idle_idx;
+ int total_energy = 0;
+ int cpu_idx;
+
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+
+
+ if (eenv->cpu[cpu_idx].cpu_id == -1)
+ continue;
+ /* Compute ACTIVE energy */
+ cap_idx = find_new_capacity(eenv, cpu_idx);
+ busy_power = sg->sge->cap_states[cap_idx].power;
+ /*
+ * in order to calculate cpu_norm_util, we need to know which
+ * capacity level the group will be at, so calculate that first
+ */
+ sg_util = group_norm_util(eenv, cpu_idx);
+
+ busy_energy = sg_util * busy_power;
+
+ /* Compute IDLE energy */
+ idle_idx = group_idle_state(eenv, cpu_idx);
+ idle_power = sg->sge->idle_states[idle_idx].power;
+
+ idle_energy = SCHED_CAPACITY_SCALE - sg_util;
+ idle_energy *= idle_power;
+
+ total_energy = busy_energy + idle_energy;
+ eenv->cpu[cpu_idx].energy += total_energy;
+ }
+}
+
+/*
+ * compute_energy() computes the absolute variation in energy consumption by
+ * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
+ *
+ * NOTE: compute_energy() may fail when racing with sched_domain updates, in
+ * which case we abort by returning -EINVAL.
+ */
+static int compute_energy(struct energy_env *eenv)
{
struct cpumask visit_cpus;
- u64 total_energy = 0;
int cpu_count;
WARN_ON(!eenv->sg_top->sge);
@@ -6716,41 +6768,18 @@ static int sched_group_energy(struct energy_env *eenv)
break;
do {
- unsigned long group_util;
- int sg_busy_energy, sg_idle_energy;
- int cap_idx, idle_idx;
-
+ eenv->sg_cap = sg;
if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
eenv->sg_cap = sg_shared_cap;
- else
- eenv->sg_cap = sg;
- cap_idx = find_new_capacity(eenv, sg->sge);
-
- if (sg->group_weight == 1) {
- /* Remove capacity of src CPU (before task move) */
- if (eenv->trg_cpu == eenv->src_cpu &&
- cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
- eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
- eenv->cap.delta -= eenv->cap.before;
- }
- /* Add capacity of dst CPU (after task move) */
- if (eenv->trg_cpu == eenv->dst_cpu &&
- cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
- eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
- eenv->cap.delta += eenv->cap.after;
- }
- }
-
- idle_idx = group_idle_state(eenv, sg);
- group_util = group_norm_util(eenv, sg);
-
- sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
- sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
- * sg->sge->idle_states[idle_idx].power);
-
- total_energy += sg_busy_energy + sg_idle_energy;
+ /*
+ * Compute the energy for all the candidate
+ * CPUs in the current visited SG.
+ */
+ eenv->sg = sg;
+ calc_sg_energy(eenv);
+ /* remove CPUs we have just visited */
if (!sd->child) {
/*
* cpu_count here is the number of
@@ -6791,7 +6820,6 @@ next_cpu:
continue;
}
- eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
return 0;
}
@@ -6800,181 +6828,101 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
}
-static inline unsigned long task_util(struct task_struct *p);
-
/*
- * energy_diff(): Estimate the energy impact of changing the utilization
- * distribution. eenv specifies the change: utilisation amount, source, and
- * destination cpu. Source or destination cpu may be -1 in which case the
- * utilization is removed from or added to the system (e.g. task wake-up). If
- * both are specified, the utilization is migrated.
+ * select_energy_cpu_idx(): estimate the energy impact of changing the
+ * utilization distribution.
+ *
+ * The eenv parameter specifies the changes: utilisation amount and a pair of
+ * possible CPU candidates (the previous CPU and a different target CPU).
+ *
+ * This function returns the index of a CPU candidate specified by the
+ * energy_env which corresponds to the first CPU saving energy.
+ * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
+ * efficient than running on prev_cpu. This is also the value returned in case
+ * of abort due to error conditions during the computations.
+ * A value greater than zero means that the first energy-efficient CPU is the
+ * one represented by eenv->cpu[eenv->next_idx].cpu_id.
*/
-static inline int __energy_diff(struct energy_env *eenv)
+static inline int select_energy_cpu_idx(struct energy_env *eenv)
{
struct sched_domain *sd;
struct sched_group *sg;
- int sd_cpu = -1, energy_before = 0, energy_after = 0;
- int diff, margin;
-
- struct energy_env eenv_before = {
- .util_delta = task_util(eenv->task),
- .src_cpu = eenv->src_cpu,
- .dst_cpu = eenv->dst_cpu,
- .trg_cpu = eenv->src_cpu,
- .nrg = { 0, 0, 0, 0},
- .cap = { 0, 0, 0 },
- .task = eenv->task,
- };
+ int sd_cpu = -1;
+ int cpu_idx;
+ int margin;
- if (eenv->src_cpu == eenv->dst_cpu)
- return 0;
-
- sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
-
if (!sd)
- return 0; /* Error */
+ return EAS_CPU_PRV;
- sg = sd->groups;
+ cpumask_clear(&eenv->cpus_mask);
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ int cpu = eenv->cpu[cpu_idx].cpu_id;
- do {
- if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
- eenv_before.sg_top = eenv->sg_top = sg;
+ if (cpu < 0)
+ continue;
+ cpumask_set_cpu(cpu, &eenv->cpus_mask);
+ }
- if (sched_group_energy(&eenv_before))
- return 0; /* Invalid result abort */
- energy_before += eenv_before.energy;
+ sg = sd->groups;
+ do {
+ /* Skip SGs which do not contains a candidate CPU */
+ if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
+ continue;
- /* Keep track of SRC cpu (before) capacity */
- eenv->cap.before = eenv_before.cap.before;
- eenv->cap.delta = eenv_before.cap.delta;
+ eenv->sg_top = sg;
+ /* energy is unscaled to reduce rounding errors */
+ if (compute_energy(eenv) == -EINVAL)
+ return EAS_CPU_PRV;
- if (sched_group_energy(eenv))
- return 0; /* Invalid result abort */
- energy_after += eenv->energy;
- }
} while (sg = sg->next, sg != sd->groups);
- eenv->nrg.before = energy_before;
- eenv->nrg.after = energy_after;
- eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
- eenv->payoff = 0;
-#ifndef CONFIG_SCHED_TUNE
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- eenv->nrg.delta, eenv->payoff);
-#endif
+ /* Scale energy before comparisons */
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
+ eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
+
/*
- * Dead-zone margin preventing too many migrations.
+ * Compute the dead-zone margin used to prevent too many task
+ * migrations with negligible energy savings.
+ * An energy saving is considered meaningful if it reduces the energy
+ * consumption of the EAS_CPU_PRV CPU candidate by at least ~1.56%
*/
+ margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
- margin = eenv->nrg.before >> 6; /* ~1.56% */
-
- diff = eenv->nrg.after - eenv->nrg.before;
-
- eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
-
- return eenv->nrg.diff;
-}
-
-#ifdef CONFIG_SCHED_TUNE
-
-struct target_nrg schedtune_target_nrg;
-
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-extern bool schedtune_initialized;
-#endif /* CONFIG_CGROUP_SCHEDTUNE */
-
-/*
- * System energy normalization
- * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
- * corresponding to the specified energy variation.
- */
-static inline int
-normalize_energy(int energy_diff)
-{
- u32 normalized_nrg;
-
-#ifdef CONFIG_CGROUP_SCHEDTUNE
- /* during early setup, we don't know the extents */
- if (unlikely(!schedtune_initialized))
- return energy_diff < 0 ? -1 : 1 ;
-#endif /* CONFIG_CGROUP_SCHEDTUNE */
-
-#ifdef CONFIG_SCHED_DEBUG
- {
- int max_delta;
+ /*
+ * By default the EAS_CPU_PRV CPU is considered the most energy
+ * efficient, with a 0 energy variation.
+ */
+ eenv->next_idx = EAS_CPU_PRV;
- /* Check for boundaries */
- max_delta = schedtune_target_nrg.max_power;
- max_delta -= schedtune_target_nrg.min_power;
- WARN_ON(abs(energy_diff) >= max_delta);
+ /*
+ * Compare the other CPU candidates to find a CPU which can be
+ * more energy efficient than EAS_CPU_PRV
+ */
+ for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ /* Skip not valid scheduled candidates */
+ if (eenv->cpu[cpu_idx].cpu_id < 0)
+ continue;
+ /* Compute energy delta wrt EAS_CPU_PRV */
+ eenv->cpu[cpu_idx].nrg_delta =
+ eenv->cpu[cpu_idx].energy -
+ eenv->cpu[EAS_CPU_PRV].energy;
+ /* filter energy variations within the dead-zone margin */
+ if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
+ eenv->cpu[cpu_idx].nrg_delta = 0;
+ /* update the schedule candidate with min(nrg_delta) */
+ if (eenv->cpu[cpu_idx].nrg_delta <
+ eenv->cpu[eenv->next_idx].nrg_delta) {
+ eenv->next_idx = cpu_idx;
+ if (sched_feat(FBT_STRICT_ORDER))
+ break;
+ }
}
-#endif
-
- /* Do scaling using positive numbers to increase the range */
- normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
-
- /* Scale by energy magnitude */
- normalized_nrg <<= SCHED_CAPACITY_SHIFT;
-
- /* Normalize on max energy for target platform */
- normalized_nrg = reciprocal_divide(
- normalized_nrg, schedtune_target_nrg.rdiv);
- return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+ return eenv->next_idx;
}
-static inline int
-energy_diff(struct energy_env *eenv)
-{
- int boost = schedtune_task_boost(eenv->task);
- int nrg_delta;
-
- /* Conpute "absolute" energy diff */
- __energy_diff(eenv);
-
- /* Return energy diff when boost margin is 0 */
- if (boost == 0) {
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- 0, -eenv->nrg.diff);
- return eenv->nrg.diff;
- }
-
- /* Compute normalized energy diff */
- nrg_delta = normalize_energy(eenv->nrg.diff);
- eenv->nrg.delta = nrg_delta;
-
- eenv->payoff = schedtune_accept_deltas(
- eenv->nrg.delta,
- eenv->cap.delta,
- eenv->task);
-
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- eenv->nrg.delta, eenv->payoff);
-
- /*
- * When SchedTune is enabled, the energy_diff() function will return
- * the computed energy payoff value. Since the energy_diff() return
- * value is expected to be negative by its callers, this evaluation
- * function return a negative value each time the evaluation return a
- * positive payoff, which is the condition for the acceptance of
- * a scheduling decision
- */
- return -eenv->payoff;
-}
-#else /* CONFIG_SCHED_TUNE */
-#define energy_diff(eenv) __energy_diff(eenv)
-#endif
-
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
* A waker of many should wake a different task than the one last awakened
@@ -7069,18 +7017,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return 1;
}
-static inline unsigned long task_util(struct task_struct *p)
-{
-#ifdef CONFIG_SCHED_WALT
- if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
- unsigned long demand = p->ravg.demand;
- return (demand << 10) / walt_ravg_window;
- }
-#endif
- return p->se.avg.util_avg;
-}
-
-static inline unsigned long boosted_task_util(struct task_struct *task);
+static inline unsigned long boosted_task_util(struct task_struct *p);
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
{
@@ -7157,16 +7094,16 @@ schedtune_cpu_margin(unsigned long util, int cpu)
}
static inline long
-schedtune_task_margin(struct task_struct *task)
+schedtune_task_margin(struct task_struct *p)
{
- int boost = schedtune_task_boost(task);
+ int boost = schedtune_task_boost(p);
unsigned long util;
long margin;
if (boost == 0)
return 0;
- util = task_util(task);
+ util = task_util(p);
margin = schedtune_margin(util, boost);
return margin;
@@ -7181,7 +7118,7 @@ schedtune_cpu_margin(unsigned long util, int cpu)
}
static inline int
-schedtune_task_margin(struct task_struct *task)
+schedtune_task_margin(struct task_struct *p)
{
return 0;
}
@@ -7200,12 +7137,12 @@ boosted_cpu_util(int cpu)
}
static inline unsigned long
-boosted_task_util(struct task_struct *task)
+boosted_task_util(struct task_struct *p)
{
- unsigned long util = task_util(task);
- long margin = schedtune_task_margin(task);
+ unsigned long util = task_util(p);
+ long margin = schedtune_task_margin(p);
- trace_sched_boost_task(task, util, margin);
+ trace_sched_boost_task(p, util, margin);
return util + margin;
}
@@ -7575,6 +7512,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
unsigned long min_wake_util = ULONG_MAX;
unsigned long target_max_spare_cap = 0;
unsigned long best_active_util = ULONG_MAX;
+ unsigned long target_idle_max_spare_cap = 0;
int best_idle_cstate = INT_MAX;
struct sched_domain *sd;
struct sched_group *sg;
@@ -7610,7 +7548,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
unsigned long capacity_curr = capacity_curr_of(i);
unsigned long capacity_orig = capacity_orig_of(i);
- unsigned long wake_util, new_util;
+ unsigned long wake_util, new_util, min_capped_util;
if (!cpu_online(i))
continue;
@@ -7632,13 +7570,18 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
* than the one required to boost the task.
*/
new_util = max(min_util, new_util);
- if (new_util > capacity_orig)
- continue;
-#ifdef CONFIG_SCHED_WALT
- if (walt_cpu_high_irqload(i))
+ /*
+ * Include minimum capacity constraint:
+ * new_util contains the required utilization including
+ * boost. min_capped_util also takes into account a
+ * minimum capacity cap imposed on the CPU by external
+ * actors.
+ */
+ min_capped_util = max(new_util, capacity_min_of(i));
+
+ if (new_util > capacity_orig)
continue;
-#endif
/*
* Case A) Latency sensitive tasks
@@ -7759,6 +7702,12 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
/* Select idle CPU with lower cap_orig */
if (capacity_orig > best_idle_min_cap_orig)
continue;
+ /* Favor CPUs that won't end up running at a
+ * high OPP.
+ */
+ if ((capacity_orig - min_capped_util) <
+ target_idle_max_spare_cap)
+ continue;
/*
* Skip CPUs in deeper idle state, but only
@@ -7772,6 +7721,8 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
/* Keep track of best idle CPU */
best_idle_min_cap_orig = capacity_orig;
+ target_idle_max_spare_cap = capacity_orig -
+ min_capped_util;
best_idle_cstate = idle_idx;
best_idle_cpu = i;
continue;
@@ -7802,10 +7753,11 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
continue;
/* Favor CPUs with maximum spare capacity */
- if ((capacity_orig - new_util) < target_max_spare_cap)
+ if ((capacity_orig - min_capped_util) <
+ target_max_spare_cap)
continue;
- target_max_spare_cap = capacity_orig - new_util;
+ target_max_spare_cap = capacity_orig - min_capped_util;
target_capacity = capacity_orig;
target_cpu = i;
}
@@ -7877,9 +7829,11 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
{
- struct sched_domain *sd;
- int target_cpu = prev_cpu, tmp_target, tmp_backup;
bool boosted, prefer_idle;
+ struct sched_domain *sd;
+ int target_cpu;
+ int backup_cpu;
+ int next_cpu;
schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
schedstat_inc(this_rq(), eas_stats.secb_attempts);
@@ -7894,7 +7848,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
}
}
- rcu_read_lock();
#ifdef CONFIG_CGROUP_SCHEDTUNE
boosted = schedtune_task_boost(p) > 0;
prefer_idle = schedtune_prefer_idle(p) > 0;
@@ -7903,31 +7856,49 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
prefer_idle = 0;
#endif
- sync_entity_load_avg(&p->se);
+ rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+ if (!sd) {
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+
+ sync_entity_load_avg(&p->se);
+
/* Find a cpu with sufficient capacity */
- tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
+ next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);
+ if (next_cpu == -1) {
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
- if (!sd)
+ /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
+ if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
+ target_cpu = next_cpu;
goto unlock;
- if (tmp_target >= 0) {
- target_cpu = tmp_target;
- if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
- schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
- schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
- goto unlock;
- }
}
- if (target_cpu != prev_cpu) {
+ target_cpu = prev_cpu;
+ if (next_cpu != prev_cpu) {
int delta = 0;
struct energy_env eenv = {
+ .p = p,
.util_delta = task_util(p),
- .src_cpu = prev_cpu,
- .dst_cpu = target_cpu,
- .task = p,
- .trg_cpu = target_cpu,
+ /* Task's previous CPU candidate */
+ .cpu[EAS_CPU_PRV] = {
+ .cpu_id = prev_cpu,
+ },
+ /* Main alternative CPU candidate */
+ .cpu[EAS_CPU_NXT] = {
+ .cpu_id = next_cpu,
+ },
+ /* Backup alternative CPU candidate */
+ .cpu[EAS_CPU_BKP] = {
+ .cpu_id = backup_cpu,
+ },
};
@@ -7940,26 +7911,21 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
if (__cpu_overutilized(prev_cpu, delta)) {
schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
+ target_cpu = next_cpu;
goto unlock;
}
- if (energy_diff(&eenv) >= 0) {
- /* No energy saving for target_cpu, try backup */
- target_cpu = tmp_backup;
- eenv.dst_cpu = target_cpu;
- eenv.trg_cpu = target_cpu;
- if (tmp_backup < 0 ||
- tmp_backup == prev_cpu ||
- energy_diff(&eenv) >= 0) {
- schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
- schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
- target_cpu = prev_cpu;
- goto unlock;
- }
+ /* Check if EAS_CPU_NXT is a more energy efficient CPU */
+ if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
+ target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
+ goto unlock;
}
- schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
- schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
goto unlock;
}
@@ -9339,6 +9305,9 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ capacity *= arch_scale_max_freq_capacity(sd, cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
+
mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
raw_spin_lock_irqsave(&mcc->lock, flags);
@@ -10373,6 +10342,17 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i);
+ /*
+ * For ASYM_CPUCAPACITY domains, don't pick a cpu that could
+ * eventually lead to active_balancing high->low capacity.
+ * Higher per-cpu capacity is considered better than balancing
+ * average load.
+ */
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ capacity_of(env->dst_cpu) < capacity &&
+ rq->nr_running == 1)
+ continue;
+
wl = weighted_cpuload(i);
/*
@@ -11677,6 +11657,92 @@ static void rq_offline_fair(struct rq *rq)
unthrottle_offline_cfs_rqs(rq);
}
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ unsigned long flags;
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+#ifdef CONFIG_SCHED_HMP
+static DEFINE_RAW_SPINLOCK(migration_lock);
+
+static bool do_migration(int reason, int new_cpu, int cpu)
+{
+ if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
+ && same_cluster(new_cpu, cpu))
+ return false;
+
+ /* Inter cluster high irqload migrations are OK */
+ return new_cpu != cpu;
+}
+
+/*
+ * Check if currently running task should be migrated to a better cpu.
+ *
+ * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq), new_cpu;
+ int active_balance = 0, reason;
+
+ reason = migration_needed(p, cpu);
+ if (!reason)
+ return;
+
+ raw_spin_lock(&migration_lock);
+ new_cpu = select_best_cpu(p, cpu, reason, 0);
+
+ if (do_migration(reason, new_cpu, cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ mark_reserved(new_cpu);
+ }
+
+ raw_spin_unlock(&migration_lock);
+
+ if (active_balance)
+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
+ &rq->active_balance_work);
+}
+#else
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int new_cpu;
+ int active_balance;
+ int cpu = task_cpu(p);
+
+ if (rq->misfit_task) {
+ if (rq->curr->state != TASK_RUNNING ||
+ rq->curr->nr_cpus_allowed == 1)
+ return;
+
+ new_cpu = select_energy_cpu_brute(p, cpu, 0);
+ if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ stop_one_cpu_nowait(cpu,
+ active_load_balance_cpu_stop,
+ rq, &rq->active_balance_work);
+ }
+ }
+}
+#endif
+
#endif /* CONFIG_SMP */
/*
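
To make the dead-zone filtering in select_energy_cpu_idx() above concrete, here is a hedged sketch; the helper name is hypothetical and the energy values are invented for illustration.

/* Energy deltas smaller than ~1.56% (1/64) of the EAS_CPU_PRV energy are
 * treated as noise.  E.g. prev_energy = 1280 -> margin = 1280 >> 6 = 20,
 * so a candidate measuring 1265 (delta -15) is considered equivalent to
 * staying on the previous CPU and its nrg_delta is clamped to 0. */
static int nrg_delta_filtered(unsigned int prev_energy,
			      unsigned int cand_energy)
{
	int margin = prev_energy >> 6;
	int delta = (int)cand_energy - (int)prev_energy;

	return (abs(delta) < margin) ? 0 : delta;
}
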
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index c30c48fde7e6..c3e301589515 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -78,3 +78,29 @@ SCHED_FEAT(ENERGY_AWARE, true)
#else
SCHED_FEAT(ENERGY_AWARE, false)
#endif
+
+/*
+ * Minimum capacity capping. Keep track of minimum capacity factor when
+ * minimum frequency available to a policy is modified.
+ * If enabled, this can be used to inform the scheduler about capacity
+ * restrictions.
+ */
+SCHED_FEAT(MIN_CAPACITY_CAPPING, true)
+
+/*
+ * Enforce the priority of candidates selected by find_best_target()
+ * ON: If the target CPU saves any energy, use that.
+ * OFF: Use whichever of target or backup saves most.
+ */
+SCHED_FEAT(FBT_STRICT_ORDER, true)
+
+/*
+ * Apply schedtune boost hold to tasks of all sched classes.
+ * If enabled, schedtune will hold the boost applied to a CPU
+ * for 50ms regardless of task activation - if the task is
+ * still running 50ms later, the boost hold expires and schedtune
+ * boost will expire immediately the task stops.
+ * If disabled, this behaviour will only apply to tasks of the
+ * RT class.
+ */
+SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ac81704e14d9..9d7f6998edd5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1439,6 +1439,25 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
}
/*
+ * Keep track of whether each cpu has an RT task that will
+ * soon schedule on that core. The problem this is intended
+ * to address is that we want to avoid entering a non-preemptible
+ * softirq handler if we are about to schedule a real-time
+ * task on that core. Ideally, we could just check whether
+ * the RT runqueue on that core had a runnable task, but the
+ * window between choosing to schedule a real-time task
+ * on a core and actually enqueueing it on that run-queue
+ * is large enough to lose races at an unacceptably high rate.
+ *
+ * This variable attempts to reduce that window by indicating
+ * when we have decided to schedule an RT task on a core
+ * but not yet enqueued it.
+ * This variable is a heuristic only: it is not guaranteed
+ * to be correct and may be updated without synchronization.
+ */
+DEFINE_PER_CPU(bool, incoming_rt_task);
+
+/*
* Adding/removing a task to/from a priority array:
*/
static void
@@ -1459,6 +1478,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+ *per_cpu_ptr(&incoming_rt_task, cpu_of(rq)) = false;
if (!schedtune_task_boost(p))
return;
@@ -1551,8 +1571,19 @@ static void yield_task_rt(struct rq *rq)
requeue_task_rt(rq, rq->curr, 0);
}
+/*
+ * Return whether the given cpu has (or will shortly have) an RT task
+ * ready to run. NB: This is a heuristic and is subject to races.
+ */
+bool
+cpu_has_rt_task(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ return rq->rt.rt_nr_running > 0 || per_cpu(incoming_rt_task, cpu);
+}
+
#ifdef CONFIG_SMP
-static int find_lowest_rq(struct task_struct *task);
+static int find_lowest_rq(struct task_struct *task, int sync);
#ifdef CONFIG_SCHED_HMP
static int
@@ -1561,7 +1592,7 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
int target;
rcu_read_lock();
- target = find_lowest_rq(p);
+ target = find_lowest_rq(p, 0);
if (target != -1)
cpu = target;
rcu_read_unlock();
@@ -1573,8 +1604,10 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
/*
* Return whether the task on the given cpu is currently non-preemptible
* while handling a potentially long softint, or if the task is likely
- * to block preemptions soon because it is a ksoftirq thread that is
- * handling slow softints.
+ * to block preemptions soon because (a) it is a ksoftirq thread that is
+ * handling slow softints, (b) it is idle and therefore likely to start
+ * processing the irqs immediately, (c) the cpu is currently handling
+ * hard irqs and will soon move on to the softirq handler.
*/
bool
task_may_not_preempt(struct task_struct *task, int cpu)
@@ -1584,8 +1617,9 @@ task_may_not_preempt(struct task_struct *task, int cpu)
struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
return ((softirqs & LONG_SOFTIRQ_MASK) &&
- (task == cpu_ksoftirqd ||
- task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+ (task == cpu_ksoftirqd || is_idle_task(task) ||
+ (task_thread_info(task)->preempt_count
+ & (HARDIRQ_MASK | SOFTIRQ_MASK))));
}
/*
@@ -1618,9 +1652,11 @@ static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
int sibling_count_hint)
{
- struct task_struct *curr;
+ struct task_struct *curr, *tgt_task;
struct rq *rq;
bool may_not_preempt;
+ int target;
+ int sync = flags & WF_SYNC;
#ifdef CONFIG_SCHED_HMP
return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
@@ -1635,58 +1671,28 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
- /*
- * If the current task on @p's runqueue is a softirq task,
- * it may run without preemption for a time that is
- * ill-suited for a waiting RT task. Therefore, try to
- * wake this RT task on another runqueue.
- *
- * Also, if the current task on @p's runqueue is an RT task, then
- * it may run without preemption for a time that is
- * ill-suited for a waiting RT task. Therefore, try to
- * wake this RT task on another runqueue.
- *
- * Also, if the current task on @p's runqueue is an RT task, then
- * try to see if we can wake this RT task up on another
- * runqueue. Otherwise simply start this RT task
- * on its current runqueue.
- *
- * We want to avoid overloading runqueues. If the woken
- * task is a higher priority, then it will stay on this CPU
- * and the lower prio task should be moved to another CPU.
- * Even though this will probably make the lower prio task
- * lose its cache, we do not want to bounce a higher task
- * around just because it gave up its CPU, perhaps for a
- * lock?
- *
- * For equal prio tasks, we just let the scheduler sort it out.
- *
- * Otherwise, just let it ride on the affined RQ and the
- * post-schedule router will push the preempted task away
- *
- * This test is optimistic, if we get it wrong the load-balancer
- * will have to sort it out.
- */
may_not_preempt = task_may_not_preempt(curr, cpu);
- if (may_not_preempt ||
- (unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio))) {
- int target = find_lowest_rq(p);
+ target = find_lowest_rq(p, sync);
- /*
- * If cpu is non-preemptible, prefer remote cpu
- * even if it's running a higher-prio task.
- * Otherwise: Don't bother moving it if the
- * destination CPU is not running a lower priority task.
- */
- if (target != -1 &&
- (may_not_preempt ||
- p->prio < cpu_rq(target)->rt.highest_prio.curr))
- cpu = target;
+ /*
+ * Check once whether we lost a race with the target cpu's irq handler.
+ * This does not happen frequently, but rechecking avoids delaying
+ * the execution of the RT task when it does.
+ */
+ if (target != -1) {
+ tgt_task = READ_ONCE(cpu_rq(target)->curr);
+ if (task_may_not_preempt(tgt_task, target))
+ target = find_lowest_rq(p, sync);
}
+ /*
+ * Possible race. Don't bother moving it if the
+ * destination CPU is not running a lower priority task.
+ */
+ if (target != -1 &&
+ (may_not_preempt || p->prio < cpu_rq(target)->rt.highest_prio.curr))
+ cpu = target;
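+	/*
+	 * Flag the chosen cpu so cpu_has_rt_task() reports an incoming RT
+	 * task; the flag is cleared again when the task is enqueued.
+	 */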
+ *per_cpu_ptr(&incoming_rt_task, cpu) = true;
rcu_read_unlock();
-
out:
/*
* If previous CPU was different, make sure to cancel any active
@@ -1730,7 +1736,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
requeue_task_rt(rq, p, 1);
resched_curr(rq);
}
-
#endif /* CONFIG_SMP */
/*
@@ -1994,12 +1999,108 @@ retry:
}
#endif /* CONFIG_SCHED_HMP */
-static int find_lowest_rq(struct task_struct *task)
+static int find_best_rt_target(struct task_struct *task, int cpu,
+			       struct cpumask *lowest_mask,
+			       bool boosted, bool prefer_idle)
+{
+ int iter_cpu;
+ int target_cpu = -1;
+ int boosted_cpu = -1;
+ int backup_cpu = -1;
+ int boosted_orig_capacity = capacity_orig_of(0);
+ int backup_capacity = 0;
+ int best_idle_cpu = -1;
+ unsigned long target_util = 0;
+ unsigned long new_util;
+ /* We want to elect the best one based on task class,
+ * idleness, and utilization.
+ */
+ for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+ int cur_capacity;
+ /*
+ * Iterate from higher cpus for boosted tasks.
+ */
+ int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+ if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(task)))
+ continue;
+
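+		/*
+		 * Estimate this cpu's utilization with the task placed on it
+		 * and skip cpus that would exceed their original capacity.
+		 */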
+ new_util = cpu_util(i) + task_util(task);
+
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+ /*
+		 * Unconditionally favor idle cpus for tasks that prefer idle,
+		 * to improve wakeup latency.
+ */
+ if (idle_cpu(i) && prefer_idle
+ && cpumask_test_cpu(i, lowest_mask) && best_idle_cpu < 0) {
+ best_idle_cpu = i;
+ continue;
+ }
+
+ if (cpumask_test_cpu(i, lowest_mask)) {
+			/* Bias cpu selection towards cpus with higher original
+			 * capacity if the task is boosted.
+			 * Assumption: higher-numbered cpus are exclusively
+			 * allotted for boosted tasks.
+ */
+ if (boosted && boosted_cpu < 0
+ && boosted_orig_capacity < capacity_orig_of(i)) {
+ boosted_cpu = i;
+ boosted_orig_capacity = capacity_orig_of(i);
+ }
+ cur_capacity = capacity_curr_of(i);
+ if (new_util < cur_capacity && cpu_rq(i)->nr_running) {
+				if (!boosted) {
+					/* Find a target cpu with the highest utilization. */
+ if (target_util < new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ } else {
+ if (target_util == 0 || target_util > new_util) {
+						/* Find a target cpu with the lowest utilization. */
+ target_cpu = i;
+ target_util = new_util;
+ }
+ }
+ } else if (backup_capacity == 0 || backup_capacity < cur_capacity) {
+				/* Select a backup CPU with the highest current capacity. */
+ backup_capacity = cur_capacity;
+ backup_cpu = i;
+ }
+ }
+ }
+
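+	/*
+	 * Boosted tasks prefer the higher-capacity cpu found above;
+	 * otherwise prefer-idle tasks take the best idle cpu.
+	 */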
+	if (boosted && boosted_cpu >= 0 && boosted_cpu > best_idle_cpu)
+ target_cpu = boosted_cpu;
+ else if (prefer_idle && best_idle_cpu >= 0)
+ target_cpu = best_idle_cpu;
+
+ if (target_cpu < 0) {
+ if (backup_cpu >= 0)
+ return backup_cpu;
+
+		/* Select the current cpu if it is present in the mask. */
+ if (cpumask_test_cpu(cpu, lowest_mask))
+ return cpu;
+
+ /* Pick a random cpu from lowest_mask */
+ target_cpu = cpumask_any(lowest_mask);
+ if (target_cpu < nr_cpu_ids)
+ return target_cpu;
+ return -1;
+ }
+ return target_cpu;
+}
+
+static int find_lowest_rq(struct task_struct *task, int sync)
{
struct sched_domain *sd;
struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ bool boosted, prefer_idle;
#ifdef CONFIG_SCHED_HMP
return find_lowest_rq_hmp(task);
@@ -2012,64 +2113,88 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
+	/* Construct the cpumask of lowest-priority cpus */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
return -1; /* No targets found */
- /*
- * At this point we have built a mask of cpus representing the
- * lowest priority tasks in the system. Now we want to elect
- * the best one based on our affinity and topology.
- *
- * We prioritize the last cpu that the task executed on since
- * it is most likely cache-hot in that location.
+	/* Return the current cpu if the WF_SYNC hint is set and the cpu
+	 * is present in lowest_mask. This improves data locality.
*/
- if (cpumask_test_cpu(cpu, lowest_mask))
- return cpu;
+ if (sysctl_sched_sync_hint_enable && sync) {
+ cpumask_t search_cpus;
+ cpumask_and(&search_cpus, tsk_cpus_allowed(task), lowest_mask);
+ if (cpumask_test_cpu(cpu, &search_cpus))
+ return cpu;
+ }
/*
- * Otherwise, we consult the sched_domains span maps to figure
- * out which cpu is logically closest to our hot cache data.
+ * At this point we have built a mask of cpus representing the
+ * lowest priority tasks in the system.
*/
- if (!cpumask_test_cpu(this_cpu, lowest_mask))
- this_cpu = -1; /* Skip this_cpu opt if not among lowest */
- rcu_read_lock();
- for_each_domain(cpu, sd) {
- if (sd->flags & SD_WAKE_AFFINE) {
- int best_cpu;
-
- /*
- * "this_cpu" is cheaper to preempt than a
- * remote processor.
- */
- if (this_cpu != -1 &&
- cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
- rcu_read_unlock();
- return this_cpu;
- }
+ boosted = schedtune_task_boost(task) > 0;
+ prefer_idle = schedtune_prefer_idle(task) > 0;
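+	/*
+	 * Boosted and prefer-idle tasks use the capacity/idleness aware
+	 * search; all other tasks fall back to the topology walk below.
+	 */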
+	if (boosted || prefer_idle) {
+ return find_best_rt_target(task, cpu, lowest_mask, boosted, prefer_idle);
+ } else {
+		/* Now we want to elect the best one based on our affinity
+ * and topology.
+ * We prioritize the last cpu that the task executed on since
+ * it is most likely cache-hot in that location.
+ */
+		struct task_struct *curr;
+ if (!cpumask_test_cpu(this_cpu, lowest_mask))
+ this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
+ /*
+ * "this_cpu" is cheaper to preempt than a
+ * remote processor.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ curr = cpu_rq(this_cpu)->curr;
+				/* Ensure that boosted/prefer-idle tasks
+				 * are not preempted even if they have
+				 * low priority. */
+ if (!curr || (schedtune_task_boost(curr) == 0
+ && schedtune_prefer_idle(curr) == 0)) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+ }
- best_cpu = cpumask_first_and(lowest_mask,
- sched_domain_span(sd));
- if (best_cpu < nr_cpu_ids) {
- rcu_read_unlock();
- return best_cpu;
+ best_cpu = cpumask_first_and(lowest_mask,
+ sched_domain_span(sd));
+ if (best_cpu < nr_cpu_ids) {
+ curr = cpu_rq(best_cpu)->curr;
+				/* Ensure that boosted/prefer-idle tasks
+				 * are not preempted even if they have
+				 * low priority. */
+				if (!curr || (schedtune_task_boost(curr) == 0
+ && schedtune_prefer_idle(curr) == 0)) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
}
}
- }
- rcu_read_unlock();
+ rcu_read_unlock();
- /*
- * And finally, if there were no matches within the domains
- * just give the caller *something* to work with from the compatible
- * locations.
- */
- if (this_cpu != -1)
- return this_cpu;
+ /* And finally, if there were no matches within the domains just
+ * give the caller *something* to work with from the compatible
+ * locations.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
- cpu = cpumask_any(lowest_mask);
- if (cpu < nr_cpu_ids)
- return cpu;
- return -1;
+ cpu = cpumask_any(lowest_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ return -1;
+ }
}
/* Will lock the rq it finds */
@@ -2080,7 +2205,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
int cpu;
for (tries = 0; tries < RT_MAX_TRIES; tries++) {
- cpu = find_lowest_rq(task);
+ cpu = find_lowest_rq(task, 0);
if ((cpu == -1) || (cpu == rq->cpu))
break;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 78ba150f2016..fa4d0ab014b1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -33,8 +33,10 @@ extern long calc_load_fold_active(struct rq *this_rq);
#ifdef CONFIG_SMP
extern void update_cpu_load_active(struct rq *this_rq);
+extern void check_for_migration(struct rq *rq, struct task_struct *p);
#else
static inline void update_cpu_load_active(struct rq *this_rq) { }
+static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
#endif
/*
@@ -1470,7 +1472,6 @@ static inline bool is_short_burst_task(struct task_struct *p)
p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
}
-extern void check_for_migration(struct rq *rq, struct task_struct *p);
extern void pre_big_task_count_change(const struct cpumask *cpus);
extern void post_big_task_count_change(const struct cpumask *cpus);
extern void set_hmp_defaults(void);
@@ -1730,7 +1731,6 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu)
return 1;
}
-static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
static inline void pre_big_task_count_change(void) { }
static inline void post_big_task_count_change(void) { }
static inline void set_hmp_defaults(void) { }
@@ -2357,6 +2357,26 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
}
#endif
+#ifndef arch_scale_max_freq_capacity
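+/* Default: no max-frequency cap is applied, i.e. the full SCHED_CAPACITY_SCALE */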
+static __always_inline
+unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifndef arch_scale_min_freq_capacity
+static __always_inline
+unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ /*
+	 * Multiplied by any capacity value, this scale factor returns 0,
+	 * which represents an uncapped state.
+ */
+ return 0;
+}
+#endif
+
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -2383,6 +2403,19 @@ extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int walt_ravg_window;
extern bool walt_disabled;
+static inline unsigned long task_util(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+ unsigned long demand = p->ravg.demand;
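+		/* Scale the windowed demand to SCHED_CAPACITY_SCALE (1 << 10) units */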
+ return (demand << 10) / walt_ravg_window;
+ }
+#endif
+ return p->se.avg.util_avg;
+}
+
/*
* cpu_util returns the amount of capacity of a CPU that is used by CFS
* tasks. The unit of the return value must be the one of capacity so we can
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index d0ef97f484b1..728553403c2b 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -17,8 +17,11 @@ bool schedtune_initialized = false;
unsigned int sysctl_sched_cfs_boost __read_mostly;
+/* We hold schedtune boost in effect for at least this long (50 ms) */
+#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL
+
extern struct reciprocal_value schedtune_spc_rdiv;
-extern struct target_nrg schedtune_target_nrg;
+struct target_nrg schedtune_target_nrg;
/* Performance Boost region (B) threshold params */
static int perf_boost_idx;
@@ -260,11 +263,14 @@ struct boost_groups {
/* Maximum boost value for all RUNNABLE tasks on a CPU */
bool idle;
int boost_max;
+ u64 boost_ts;
struct {
/* The boost for tasks on that boost group */
int boost;
/* Count of RUNNABLE tasks on that boost group */
unsigned tasks;
+ /* Timestamp of boost activation */
+ u64 ts;
} group[BOOSTGROUPS_COUNT];
/* CPU's boost group locking */
raw_spinlock_t lock;
@@ -388,32 +394,52 @@ static inline void init_sched_boost(struct schedtune *st) { }
#endif /* CONFIG_SCHED_HMP */
+static inline bool schedtune_boost_timeout(u64 now, u64 ts)
+{
+ return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
+}
+
+static inline bool
+schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now)
+{
+ if (bg->group[idx].tasks)
+ return true;
+
+ return !schedtune_boost_timeout(now, bg->group[idx].ts);
+}
+
static void
-schedtune_cpu_update(int cpu)
+schedtune_cpu_update(int cpu, u64 now)
{
struct boost_groups *bg;
- int boost_max;
+ u64 boost_ts = now;
+ int boost_max = INT_MIN;
int idx;
bg = &per_cpu(cpu_boost_groups, cpu);
- /* The root boost group is always active */
- boost_max = bg->group[0].boost;
- for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+ for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) {
/*
* A boost group affects a CPU only if it has
- * RUNNABLE tasks on that CPU
+		 * RUNNABLE tasks on that CPU or a boost hold
+		 * is still in effect from a previous task.
*/
- if (bg->group[idx].tasks == 0)
+ if (!schedtune_boost_group_active(idx, bg, now))
+ continue;
+
+ /* this boost group is active */
+ if (boost_max > bg->group[idx].boost)
continue;
- boost_max = max(boost_max, bg->group[idx].boost);
+ boost_max = bg->group[idx].boost;
+ boost_ts = bg->group[idx].ts;
}
- /* Ensures boost_max is non-negative when all cgroup boost values
- * are neagtive. Avoids under-accounting of cpu capacity which may cause
- * task stacking and frequency spikes.*/
- boost_max = max(boost_max, 0);
+
+ /* If there are no active boost groups on the CPU, set no boost */
+ if (boost_max == INT_MIN)
+ boost_max = 0;
bg->boost_max = boost_max;
+ bg->boost_ts = boost_ts;
}
static int
@@ -423,6 +449,7 @@ schedtune_boostgroup_update(int idx, int boost)
int cur_boost_max;
int old_boost;
int cpu;
+ u64 now;
/* Update per CPU boost groups */
for_each_possible_cpu(cpu) {
@@ -439,16 +466,22 @@ schedtune_boostgroup_update(int idx, int boost)
/* Update the boost value of this boost group */
bg->group[idx].boost = boost;
- /* Check if this update increase current max */
- if (boost > cur_boost_max && bg->group[idx].tasks) {
+ now = sched_clock_cpu(cpu);
+ /*
+	 * Check if this update increases the current max.
+ */
+ if (boost > cur_boost_max &&
+ schedtune_boost_group_active(idx, bg, now)) {
bg->boost_max = boost;
+ bg->boost_ts = bg->group[idx].ts;
+
trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
continue;
}
/* Check if this update has decreased current max */
if (cur_boost_max == old_boost && old_boost > boost) {
- schedtune_cpu_update(cpu);
+ schedtune_cpu_update(cpu, now);
trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
continue;
}
@@ -462,21 +495,38 @@ schedtune_boostgroup_update(int idx, int boost)
#define ENQUEUE_TASK 1
#define DEQUEUE_TASK -1
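+/*
+ * Refresh the boost hold timestamp only for RT tasks, unless the
+ * SCHEDTUNE_BOOST_HOLD_ALL feature extends the hold to every task.
+ */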
+static inline bool
+schedtune_update_timestamp(struct task_struct *p)
+{
+ if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
+ return true;
+
+ return task_has_rt_policy(p);
+}
+
static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
int tasks = bg->group[idx].tasks + task_count;
+ u64 now;
/* Update boosted tasks count while avoiding to make it negative */
bg->group[idx].tasks = max(0, tasks);
+	/* Refresh the boost hold timestamp on enqueue */
+ if (task_count > 0) {
+ now = sched_clock_cpu(cpu);
+ if (schedtune_update_timestamp(p))
+ bg->group[idx].ts = now;
+
+ /* Boost group activation or deactivation on that RQ */
+ if (bg->group[idx].tasks == 1)
+ schedtune_cpu_update(cpu, now);
+ }
trace_sched_tune_tasks_update(p, cpu, tasks, idx,
- bg->group[idx].boost, bg->boost_max);
-
- /* Boost group activation or deactivation on that RQ */
- if (tasks == 1 || tasks == 0)
- schedtune_cpu_update(cpu);
+ bg->group[idx].boost, bg->boost_max,
+ bg->group[idx].ts);
}
/*
@@ -529,6 +579,7 @@ int schedtune_can_attach(struct cgroup_taskset *tset)
int src_bg; /* Source boost group index */
int dst_bg; /* Destination boost group index */
int tasks;
+ u64 now;
if (!unlikely(schedtune_initialized))
return 0;
@@ -574,18 +625,19 @@ int schedtune_can_attach(struct cgroup_taskset *tset)
* current boost group.
*/
+ now = sched_clock_cpu(cpu);
+
/* Move task from src to dst boost group */
tasks = bg->group[src_bg].tasks - 1;
bg->group[src_bg].tasks = max(0, tasks);
bg->group[dst_bg].tasks += 1;
+ bg->group[dst_bg].ts = now;
+
+	/* Expire the hold so the next schedtune_cpu_boost() recomputes boost_max */
+ bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
raw_spin_unlock(&bg->lock);
unlock_rq_of(rq, task, &irq_flags);
-
- /* Update CPU boost group */
- if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
- schedtune_cpu_update(task_cpu(task));
-
}
return 0;
@@ -666,8 +718,15 @@ void schedtune_exit_task(struct task_struct *tsk)
int schedtune_cpu_boost(int cpu)
{
struct boost_groups *bg;
+ u64 now;
bg = &per_cpu(cpu_boost_groups, cpu);
+ now = sched_clock_cpu(cpu);
+
+ /* check to see if we have a hold in effect */
+ if (schedtune_boost_timeout(now, bg->boost_ts))
+ schedtune_cpu_update(cpu, now);
+
return bg->boost_max;
}
@@ -831,6 +890,7 @@ schedtune_boostgroup_init(struct schedtune *st)
bg = &per_cpu(cpu_boost_groups, cpu);
bg->group[st->idx].boost = 0;
bg->group[st->idx].tasks = 0;
+ bg->group[st->idx].ts = 0;
}
return 0;
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 911606537808..0162ff4647b6 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -55,7 +55,7 @@ __read_mostly unsigned int walt_ravg_window =
static unsigned int sync_cpu;
static ktime_t ktime_last;
-static bool walt_ktime_suspended;
+static __read_mostly bool walt_ktime_suspended;
static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
{
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d69b77fc7cc1..a4aefd8152b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -252,7 +252,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
struct softirq_action *h;
bool in_hardirq;
__u32 deferred;
- __u32 pending;
+ __u32 pending, pending_now, pending_delay, pending_mask;
int softirq_bit;
/*
@@ -262,7 +262,21 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
*/
current->flags &= ~PF_MEMALLOC;
+ /*
+	 * If this is not the ksoftirqd thread and there is an RT task
+	 * running or waiting to run on this cpu, delay the long-running
+	 * softirq handlers by leaving them for the ksoftirqd thread.
+ */
+ if (current != __this_cpu_read(ksoftirqd) &&
+ cpu_has_rt_task(smp_processor_id()))
+ pending_mask = LONG_SOFTIRQ_MASK;
+ else
+ pending_mask = 0;
+
pending = local_softirq_pending();
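+	/* Handle pending_now inline below; pending_delay stays pending for ksoftirqd */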
+ pending_delay = pending & pending_mask;
+ pending_now = pending & ~pending_mask;
deferred = softirq_deferred_for_rt(pending);
account_irq_enter_time(current);
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
@@ -270,14 +284,14 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
restart:
/* Reset the pending bitmask before enabling irqs */
- set_softirq_pending(deferred);
- __this_cpu_write(active_softirqs, pending);
+ __this_cpu_write(active_softirqs, pending_now);
+ set_softirq_pending(pending_delay);
local_irq_enable();
h = softirq_vec;
- while ((softirq_bit = ffs(pending))) {
+ while ((softirq_bit = ffs(pending_now))) {
unsigned int vec_nr;
int prev_count;
@@ -298,7 +312,7 @@ restart:
preempt_count_set(prev_count);
}
h++;
- pending >>= softirq_bit;
+ pending_now >>= softirq_bit;
}
__this_cpu_write(active_softirqs, 0);
@@ -307,10 +321,12 @@ restart:
pending = local_softirq_pending();
deferred = softirq_deferred_for_rt(pending);
+ pending_delay = pending & pending_mask;
+ pending_now = pending & ~pending_mask;
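+	/* Only restart if inline-handleable work remains and nothing was deferred */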
if (pending) {
- if (time_before(jiffies, end) && !need_resched() &&
- --max_restart)
+ if (pending_now && time_before(jiffies, end) &&
+ !need_resched() && --max_restart && !deferred)
goto restart;
}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index baa82e3ab2c0..8a9ff85df78f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1566,7 +1566,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
{
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ if ((iter->ent->type != TRACE_BLK) ||
+ !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
return TRACE_TYPE_UNHANDLED;
return print_one_line(iter, true);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 17996354c745..a0c54d118c4e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -235,6 +235,10 @@ __setup("trace_clock=", set_trace_boot_clock);
static int __init set_tracepoint_printk(char *str)
{
+ /* Ignore the "tp_printk_stop_on_boot" param */
+ if (*str == '_')
+ return 0;
+
if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
tracepoint_printk = 1;
return 1;
@@ -841,10 +845,12 @@ static int __init set_buf_size(char *str)
if (!str)
return 0;
buf_size = memparse(str, &str);
- /* nr_entries can not be zero */
- if (buf_size == 0)
- return 0;
- trace_buf_size = buf_size;
+ /*
+	 * nr_entries cannot be zero and the startup
+ * tests require some buffer space. Therefore
+ * ensure we have at least 4096 bytes of buffer.
+ */
+ trace_buf_size = max(4096UL, buf_size);
return 1;
}
__setup("trace_buf_size=", set_buf_size);
@@ -2537,8 +2543,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
* will point to the same string as current_trace->name.
*/
mutex_lock(&trace_types_lock);
- if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
+ if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) {
+ /* Close iter->trace before switching to the new current tracer */
+ if (iter->trace->close)
+ iter->trace->close(iter);
*iter->trace = *tr->current_trace;
+ /* Reopen the new current tracer */
+ if (iter->trace->open)
+ iter->trace->open(iter);
+ }
mutex_unlock(&trace_types_lock);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -4537,12 +4550,18 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
+static bool tracer_options_updated;
+
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
if (!tr->dir)
return;
+	/* Only create trace option files after update_tracer_options() has run */
+ if (!tracer_options_updated)
+ return;
+
create_trace_option_files(tr, t);
}
@@ -4959,7 +4978,20 @@ waitagain:
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- /* don't print partial lines */
+ /*
+		 * If one print_trace_line() fills the entire trace_seq in one shot,
+		 * trace_seq_to_user() will return -EBUSY because save_len == 0.
+		 * In this case, we need to consume the event; otherwise the loop
+		 * will peek at it again next time, resulting in an infinite loop.
+ */
+ if (save_len == 0) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ trace_consume(iter);
+ break;
+ }
+
+ /* In other cases, don't print partial lines */
iter->seq.seq.len = save_len;
break;
}
@@ -6827,6 +6859,7 @@ static void __update_tracer_options(struct trace_array *tr)
static void update_tracer_options(struct trace_array *tr)
{
mutex_lock(&trace_types_lock);
+ tracer_options_updated = true;
__update_tracer_options(tr);
mutex_unlock(&trace_types_lock);
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c00137ea939e..48c446d602f6 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -215,7 +215,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
-
+ else
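+		/* Clear any stale private data left over from a previous graph-tracer open */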
+ iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 927fd4ad5846..528dc408dcf9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -272,6 +272,8 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
+ else
+ iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)