summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/verifier.c5
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/events/core.c73
-rw-r--r--kernel/events/hw_breakpoint.c2
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/locking/rwsem-xadd.c35
-rw-r--r--kernel/resource.c13
-rw-r--r--kernel/sched/core.c36
-rw-r--r--kernel/sched/cpufreq_schedutil.c154
-rw-r--r--kernel/sched/cpupri.c11
-rw-r--r--kernel/sched/fair.c408
-rw-r--r--kernel/sched/rt.c15
-rw-r--r--kernel/sched/walt.c8
-rw-r--r--kernel/sched/walt.h2
-rw-r--r--kernel/softirq.c5
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/time/alarmtimer.c3
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/trace/trace.c101
-rw-r--r--kernel/trace/trace_kprobe.c21
22 files changed, 702 insertions, 231 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 85de5094b936..c97bce6a0e0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -765,6 +765,11 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1a26ef5b7d58..1d6b0a209bc0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -375,6 +375,21 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
goto out_release;
}
+ /*
+ * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+ * and RCU users of this state to go away such that all new such users
+ * will observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so wait for both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
+
smpboot_park_threads(cpu);
/*
@@ -508,8 +523,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- pr_warn("%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+ pr_warn_ratelimited("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7fee87daac56..87f5d841f796 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1693,7 +1693,33 @@ static int __perf_remove_from_context(void *info)
return 0;
}
-/*
+
+#ifdef CONFIG_SMP
+static void perf_retry_remove(struct perf_event *event,
+ struct remove_event *rep)
+{
+ int up_ret;
+ /*
+ * CPU was offline. Bring it online so we can
+ * gracefully exit a perf context.
+ */
+ up_ret = cpu_up(event->cpu);
+ if (!up_ret)
+ /* Try the remove call once again. */
+ cpu_function_call(event->cpu, __perf_remove_from_context,
+ rep);
+ else
+ pr_err("Failed to bring up CPU: %d, ret: %d\n",
+ event->cpu, up_ret);
+}
+#else
+static void perf_retry_remove(struct perf_event *event,
+ struct remove_event *rep)
+{
+}
+#endif
+
+ /*
* Remove the event from a task's (or a CPU's) list of events.
*
* CPU events are removed with a smp call. For task events we only
@@ -1728,6 +1754,9 @@ static void __ref perf_remove_from_context(struct perf_event *event,
*/
ret = cpu_function_call(event->cpu, __perf_remove_from_context,
&re);
+ if (ret == -ENXIO)
+ perf_retry_remove(event, &re);
+
return;
}
@@ -3408,22 +3437,27 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0;
+ int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
+ event_cpu = READ_ONCE(event->oncpu);
+
if (event->state == PERF_EVENT_STATE_ACTIVE &&
- !cpu_isolated(event->oncpu)) {
+ !cpu_isolated(event_cpu)) {
struct perf_read_data data = {
.event = event,
.group = group,
.ret = 0,
};
+
+ if ((unsigned int)event_cpu >= nr_cpu_ids)
+ return 0;
if (!event->attr.exclude_idle ||
- !per_cpu(is_idle, event->oncpu)) {
- smp_call_function_single(event->oncpu,
+ !per_cpu(is_idle, event_cpu)) {
+ smp_call_function_single(event_cpu,
__perf_event_read, &data, 1);
ret = data.ret;
}
@@ -6566,21 +6600,6 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
-{
- /*
- * Due to interrupt latency (AKA "skid"), we may enter the
- * kernel before taking an overflow, even if the PMU is only
- * counting user events.
- * To avoid leaking information to userspace, we must always
- * reject kernel samples when exclude_kernel is set.
- */
- if (event->attr.exclude_kernel && !user_mode(regs))
- return false;
-
- return true;
-}
-
/*
* Generic event overflow handling, sampling.
*/
@@ -6628,12 +6647,6 @@ static int __perf_event_overflow(struct perf_event *event,
}
/*
- * For security, drop the skid kernel samples if necessary.
- */
- if (!sample_is_allowed(event, regs))
- return ret;
-
- /*
* XXX event_limit might not quite work as expected on inherited
* events
*/
@@ -7109,6 +7122,8 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .events_across_hotplug = 1,
};
#ifdef CONFIG_EVENT_TRACING
@@ -7230,6 +7245,8 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .events_across_hotplug = 1,
};
static inline void perf_tp_register(void)
@@ -7517,6 +7534,8 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
+
+ .events_across_hotplug = 1,
};
/*
@@ -7598,6 +7617,8 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
+
+ .events_across_hotplug = 1,
};
static void perf_pmu_nop_void(struct pmu *pmu)
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 92ce5f4ccc26..7da5b674d16e 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -614,6 +614,8 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
+
+ .events_across_hotplug = 1,
};
int __init init_hw_breakpoint(void)
diff --git a/kernel/extable.c b/kernel/extable.c
index e820ccee9846..4f06fc34313f 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -66,7 +66,7 @@ static inline int init_kernel_text(unsigned long addr)
return 0;
}
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index 246b8a57a32d..fef4df444f47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -832,8 +832,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mm = get_task_mm(task);
if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode) &&
- !capable(CAP_SYS_RESOURCE)) {
+ !ptrace_may_access(task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a4d4de05b2d1..75c950ede9c7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -511,6 +511,41 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
unsigned long flags;
/*
+ * If a spinner is present, there is a chance that the load of
+ * rwsem_has_spinner() in rwsem_wake() can be reordered with
+ * respect to decrement of rwsem count in __up_write() leading
+ * to wakeup being missed.
+ *
+ * spinning writer up_write caller
+ * --------------- -----------------------
+ * [S] osq_unlock() [L] osq
+ * spin_lock(wait_lock)
+ * sem->count=0xFFFFFFFF00000001
+ * +0xFFFFFFFF00000000
+ * count=sem->count
+ * MB
+ * sem->count=0xFFFFFFFE00000001
+ * -0xFFFFFFFF00000001
+ * RMB
+ * spin_trylock(wait_lock)
+ * return
+ * rwsem_try_write_lock(count)
+ * spin_unlock(wait_lock)
+ * schedule()
+ *
+ * Reordering of atomic_long_sub_return_release() in __up_write()
+ * and rwsem_has_spinner() in rwsem_wake() can cause missing of
+ * wakeup in up_write() context. In spinning writer, sem->count
+ * and local variable count is 0XFFFFFFFE00000001. It would result
+ * in rwsem_try_write_lock() failing to acquire rwsem and spinning
+ * writer going to sleep in rwsem_down_write_failed().
+ *
+ * The smp_rmb() here is to make sure that the spinner state is
+ * consulted after sem->count is updated in up_write context.
+ */
+ smp_rmb();
+
+ /*
* If a spinner is present, it is not necessary to do the wakeup.
* Try to do wakeup only if the trylock succeeds to minimize
* spinlock contention which may introduce too much delay in the
diff --git a/kernel/resource.c b/kernel/resource.c
index 4c9835c09dcd..c09d484f7b5f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v)
{
struct resource *root = m->private;
struct resource *r = v, *p;
+ unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
+
+ if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
+ start = r->start;
+ end = r->end;
+ } else {
+ start = end = 0;
+ }
+
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
depth * 2, "",
- width, (unsigned long long) r->start,
- width, (unsigned long long) r->end,
+ width, start,
+ width, end,
r->name ? r->name : "<BAD>");
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0071785e698b..4ecca604e64b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6924,6 +6924,9 @@ enum s_alloc {
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
*
+ * Only CPUs that can arrive at this group should be considered to continue
+ * balancing.
+ *
* Asymmetric node setups can result in situations where the domain tree is of
* unequal depth, make sure to skip domains that already cover the entire
* range.
@@ -6935,18 +6938,31 @@ enum s_alloc {
*/
static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ const struct cpumask *sg_span = sched_group_cpus(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
+ continue;
+
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
continue;
cpumask_set_cpu(i, sched_group_mask(sg));
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
}
/*
@@ -9216,11 +9232,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -9602,6 +9627,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 75bfbb336722..e12309c1b07b 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,6 +47,7 @@ struct sugov_policy {
s64 up_rate_delay_ns;
s64 down_rate_delay_ns;
unsigned int next_freq;
+ unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
@@ -63,7 +64,6 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
- unsigned int cached_raw_freq;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -72,6 +72,11 @@ struct sugov_cpu {
unsigned long util;
unsigned long max;
unsigned int flags;
+
+ /* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -127,22 +132,20 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
if (sugov_up_down_rate_limit(sg_policy, time, next_freq))
return;
+ if (sg_policy->next_freq == next_freq)
+ return;
+
+ sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
+
if (policy->fast_switch_enabled) {
- if (sg_policy->next_freq == next_freq) {
- trace_cpu_frequency(policy->cur, smp_processor_id());
- return;
- }
- sg_policy->next_freq = next_freq;
- sg_policy->last_freq_update_time = time;
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
if (next_freq == CPUFREQ_ENTRY_INVALID)
return;
policy->cur = next_freq;
trace_cpu_frequency(next_freq, smp_processor_id());
- } else if (sg_policy->next_freq != next_freq) {
- sg_policy->next_freq = next_freq;
- sg_policy->last_freq_update_time = time;
+ } else {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -150,7 +153,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @sg_cpu: schedutil cpu object to compute the new frequency for.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -170,19 +173,18 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq (as calculated above) is returned, subject to policy min/max and
* cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
- unsigned long max)
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
{
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
return sg_policy->next_freq;
- sg_cpu->cached_raw_freq = freq;
+ sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -248,6 +250,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
sg_cpu->iowait_boost >>= 1;
}
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls = tick_nohz_get_idle_calls();
+ bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
@@ -256,6 +271,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max;
unsigned int next_f;
+ bool busy;
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -263,40 +279,37 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
+ busy = sugov_cpu_is_busy(sg_cpu);
+
if (flags & SCHED_CPUFREQ_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
sugov_get_util(&util, &max, time);
sugov_iowait_boost(sg_cpu, &util, &max);
- next_f = get_next_freq(sg_cpu, util, max);
+ next_f = get_next_freq(sg_policy, util, max);
+ /*
+ * Do not reduce the frequency if the CPU has not been idle
+ * recently, as the reduction is likely to be premature then.
+ */
+ if (busy && next_f < sg_policy->next_freq)
+ next_f = sg_policy->next_freq;
}
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
- unsigned long util, unsigned long max,
- unsigned int flags)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int max_f = policy->cpuinfo.max_freq;
u64 last_freq_update_time = sg_policy->last_freq_update_time;
+ unsigned long util = 0, max = 1;
unsigned int j;
- if (flags & SCHED_CPUFREQ_DL)
- return max_f;
-
- sugov_iowait_boost(sg_cpu, &util, &max);
-
for_each_cpu(j, policy->cpus) {
- struct sugov_cpu *j_sg_cpu;
+ struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
s64 delta_ns;
- if (j == smp_processor_id())
- continue;
-
- j_sg_cpu = &per_cpu(sugov_cpu, j);
/*
* If the CPU utilization was last updated before the previous
* frequency update and the time elapsed between the last update
@@ -310,7 +323,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
- return max_f;
+ return policy->cpuinfo.max_freq;
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
@@ -322,7 +335,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
sugov_iowait_boost(j_sg_cpu, &util, &max);
}
- return get_next_freq(sg_cpu, util, max);
+ return get_next_freq(sg_policy, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -345,7 +358,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
+ if (flags & SCHED_CPUFREQ_DL)
+ next_f = sg_policy->policy->cpuinfo.max_freq;
+ else
+ next_f = sugov_next_freq_shared(sg_cpu);
+
sugov_update_commit(sg_policy, time, next_f);
}
@@ -371,15 +388,15 @@ static void sugov_irq_work(struct irq_work *irq_work)
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
/*
- * For Real Time and Deadline tasks, schedutil governor shoots the
- * frequency to maximum. And special care must be taken to ensure that
- * this kthread doesn't result in that.
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
*
* This is (mostly) guaranteed by the work_in_progress flag. The flag is
- * updated only at the end of the sugov_work() and before that schedutil
- * rejects all other frequency scaling requests.
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
*
- * Though there is a very rare case where the RT thread yields right
+ * There is a very rare case though, where the RT thread yields right
* after the work_in_progress flag is cleared. The effects of that are
* neglected for now.
*/
@@ -489,15 +506,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
return NULL;
sg_policy->policy = policy;
- init_irq_work(&sg_policy->irq_work, sugov_irq_work);
- mutex_init(&sg_policy->work_lock);
raw_spin_lock_init(&sg_policy->update_lock);
return sg_policy;
}
static void sugov_policy_free(struct sugov_policy *sg_policy)
{
- mutex_destroy(&sg_policy->work_lock);
kfree(sg_policy);
}
@@ -531,6 +545,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
sg_policy->thread = thread;
kthread_bind_mask(thread, policy->related_cpus);
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
wake_up_process(thread);
return 0;
@@ -544,6 +561,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy)
flush_kthread_worker(&sg_policy->worker);
kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
}
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
@@ -578,9 +596,13 @@ static int sugov_init(struct cpufreq_policy *policy)
if (policy->governor_data)
return -EBUSY;
+ cpufreq_enable_fast_switch(policy);
+
sg_policy = sugov_policy_alloc(policy);
- if (!sg_policy)
- return -ENOMEM;
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
ret = sugov_kthread_create(sg_policy);
if (ret)
@@ -623,13 +645,11 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
- out:
+out:
mutex_unlock(&global_tunables_lock);
-
- cpufreq_enable_fast_switch(policy);
return 0;
- fail:
+fail:
policy->governor_data = NULL;
sugov_tunables_free(tunables);
@@ -640,6 +660,10 @@ free_sg_policy:
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
pr_err("initialization failed (error %d)\n", ret);
return ret;
}
@@ -650,8 +674,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
struct sugov_tunables *tunables = sg_policy->tunables;
unsigned int count;
- cpufreq_disable_fast_switch(policy);
-
mutex_lock(&global_tunables_lock);
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -664,6 +686,7 @@ static int sugov_exit(struct cpufreq_policy *policy)
sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
+ cpufreq_disable_fast_switch(policy);
return 0;
}
@@ -681,25 +704,19 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
+ sg_policy->cached_raw_freq = 0;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->sg_policy = sg_policy;
- if (policy_is_shared(policy)) {
- sg_cpu->util = 0;
- sg_cpu->max = 0;
- sg_cpu->flags = SCHED_CPUFREQ_DL;
- sg_cpu->last_update = 0;
- sg_cpu->cached_raw_freq = 0;
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- sugov_update_shared);
- } else {
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- sugov_update_single);
- }
+ sg_cpu->flags = SCHED_CPUFREQ_DL;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ policy_is_shared(policy) ?
+ sugov_update_shared :
+ sugov_update_single);
}
return 0;
}
@@ -714,9 +731,10 @@ static int sugov_stop(struct cpufreq_policy *policy)
synchronize_sched();
- irq_work_sync(&sg_policy->irq_work);
- kthread_cancel_work_sync(&sg_policy->work);
-
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
return 0;
}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1d00cf8c00fa..14225d5d8617 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -279,3 +279,14 @@ void cpupri_cleanup(struct cpupri *cp)
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
+
+/*
+ * cpupri_check_rt - check if CPU has a RT task
+ * should be called from rcu-sched read section.
+ */
+bool cpupri_check_rt(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ return cpu_rq(cpu)->rd->cpupri.cpu_to_pri[cpu] > CPUPRI_NORMAL;
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 422438d43d90..853064319b0d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -35,6 +35,8 @@
#include "sched.h"
#include <trace/events/sched.h>
#include "tune.h"
+#include "walt.h"
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -6548,9 +6550,11 @@ static int find_new_capacity(struct energy_env *eenv,
return idx;
}
-static int group_idle_state(struct sched_group *sg)
+static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
{
int i, state = INT_MAX;
+ int src_in_grp, dst_in_grp;
+ long grp_util = 0;
/* Find the shallowest idle state in the sched group. */
for_each_cpu(i, sched_group_cpus(sg))
@@ -6559,6 +6563,54 @@ static int group_idle_state(struct sched_group *sg)
/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
state++;
+ /*
+ * Try to estimate if a deeper idle state is
+ * achievable when we move the task.
+ */
+ for_each_cpu(i, sched_group_cpus(sg))
+ grp_util += cpu_util(i);
+
+ src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ if (src_in_grp == dst_in_grp) {
+ /* both CPUs under consideration are in the same group or not in
+ * either group, migration should leave idle state the same.
+ */
+ goto end;
+ }
+ /* add or remove util as appropriate to indicate what group util
+ * will be (worst case - no concurrent execution) after moving the task
+ */
+ grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta;
+
+ if (grp_util <=
+ ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
+ /* after moving, this group is at most partly
+ * occupied, so it should have some idle time.
+ */
+ int max_idle_state_idx = sg->sge->nr_idle_states - 2;
+ int new_state = grp_util * max_idle_state_idx;
+ if (grp_util <= 0)
+ /* group will have no util, use lowest state */
+ new_state = max_idle_state_idx + 1;
+ else {
+ /* for partially idle, linearly map util to idle
+ * states, excluding the lowest one. This does not
+ * correspond to the state we expect to enter in
+ * reality, but an indication of what might happen.
+ */
+ new_state = min(max_idle_state_idx, (int)
+ (new_state / sg->sgc->max_capacity));
+ new_state = max_idle_state_idx - new_state;
+ }
+ state = new_state;
+ } else {
+ /* After moving, the group will be fully occupied
+ * so assume it will not be idle at all.
+ */
+ state = 0;
+ }
+end:
return state;
}
@@ -6631,8 +6683,9 @@ static int sched_group_energy(struct energy_env *eenv)
}
}
- idle_idx = group_idle_state(sg);
+ idle_idx = group_idle_state(eenv, sg);
group_util = group_norm_util(eenv, sg);
+
sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
>> SCHED_CAPACITY_SHIFT;
sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
@@ -7337,48 +7390,59 @@ static int start_cpu(bool boosted)
return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
}
-static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
+ bool boosted, bool prefer_idle)
{
- int target_cpu = -1;
- unsigned long target_util = prefer_idle ? ULONG_MAX : 0;
- unsigned long backup_capacity = ULONG_MAX;
- int best_idle_cpu = -1;
- int best_idle_cstate = INT_MAX;
- int backup_cpu = -1;
+ unsigned long best_idle_min_cap_orig = ULONG_MAX;
unsigned long min_util = boosted_task_util(p);
+ unsigned long target_capacity = ULONG_MAX;
+ unsigned long min_wake_util = ULONG_MAX;
+ unsigned long target_max_spare_cap = 0;
+ unsigned long target_util = ULONG_MAX;
+ unsigned long best_active_util = ULONG_MAX;
+ int best_idle_cstate = INT_MAX;
struct sched_domain *sd;
struct sched_group *sg;
- int cpu = start_cpu(boosted);
+ int best_active_cpu = -1;
+ int best_idle_cpu = -1;
+ int target_cpu = -1;
+ int cpu, i;
+
+ *backup_cpu = -1;
schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
schedstat_inc(this_rq(), eas_stats.fbt_attempts);
+ /* Find start CPU based on boost value */
+ cpu = start_cpu(boosted);
if (cpu < 0) {
schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
- return target_cpu;
+ return -1;
}
+ /* Find SD for the start CPU */
sd = rcu_dereference(per_cpu(sd_ea, cpu));
-
if (!sd) {
schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
- return target_cpu;
+ return -1;
}
+ /* Scan CPUs in all SDs */
sg = sd->groups;
-
do {
- int i;
-
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
- unsigned long cur_capacity, new_util, wake_util;
- unsigned long min_wake_util = ULONG_MAX;
+ unsigned long capacity_curr = capacity_curr_of(i);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ unsigned long wake_util, new_util;
if (!cpu_online(i))
continue;
+ if (walt_cpu_high_irqload(i))
+ continue;
+
/*
* p's blocked utilization is still accounted for on prev_cpu
* so prev_cpu will receive a negative bias due to the double
@@ -7393,65 +7457,204 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre
* than the one required to boost the task.
*/
new_util = max(min_util, new_util);
-
- if (new_util > capacity_orig_of(i))
+ if (new_util > capacity_orig)
continue;
/*
- * Unconditionally favoring tasks that prefer idle cpus to
+ * Case A) Latency sensitive tasks
+ *
+ * Unconditionally favoring tasks that prefer idle CPU to
* improve latency.
+ *
+ * Looking for:
+ * - an idle CPU, whatever its idle_state is, since
+ * the first CPUs we explore are more likely to be
+ * reserved for latency sensitive tasks.
+ * - a non idle CPU where the task fits in its current
+ * capacity and has the maximum spare capacity.
+ * - a non idle CPU with lower contention from other
+ * tasks and running at the lowest possible OPP.
+ *
+ * The last two goals tries to favor a non idle CPU
+ * where the task can run as if it is "almost alone".
+ * A maximum spare capacity CPU is favoured since
+ * the task already fits into that CPU's capacity
+ * without waiting for an OPP chance.
+ *
+ * The following code path is the only one in the CPUs
+ * exploration loop which is always used by
+ * prefer_idle tasks. It exits the loop with wither a
+ * best_active_cpu or a target_cpu which should
+ * represent an optimal choice for latency sensitive
+ * tasks.
*/
- if (idle_cpu(i) && prefer_idle) {
- schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
- schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
- return i;
- }
+ if (prefer_idle) {
- cur_capacity = capacity_curr_of(i);
-
- if (new_util < cur_capacity) {
- if (cpu_rq(i)->nr_running) {
- /*
- * Find a target cpu with the lowest/highest
- * utilization if prefer_idle/!prefer_idle.
- */
- if (prefer_idle) {
- /* Favor the CPU that last ran the task */
- if (new_util > target_util ||
- wake_util > min_wake_util)
- continue;
- min_wake_util = wake_util;
- target_util = new_util;
- target_cpu = i;
- } else if (target_util < new_util) {
- target_util = new_util;
- target_cpu = i;
- }
- } else if (!prefer_idle) {
- int idle_idx = idle_get_state_idx(cpu_rq(i));
+ /*
+ * Case A.1: IDLE CPU
+ * Return the first IDLE CPU we find.
+ */
+ if (idle_cpu(i)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
+ schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
- if (best_idle_cpu < 0 ||
- (sysctl_sched_cstate_aware &&
- best_idle_cstate > idle_idx)) {
- best_idle_cstate = idle_idx;
- best_idle_cpu = i;
- }
+ trace_sched_find_best_target(p,
+ prefer_idle, min_util,
+ cpu, best_idle_cpu,
+ best_active_cpu, i);
+
+ return i;
}
- } else if (backup_capacity > cur_capacity) {
- /* Find a backup cpu with least capacity. */
- backup_capacity = cur_capacity;
- backup_cpu = i;
+
+ /*
+ * Case A.2: Target ACTIVE CPU
+ * Favor CPUs with max spare capacity.
+ */
+ if ((capacity_curr > new_util) &&
+ (capacity_orig - new_util > target_max_spare_cap)) {
+ target_max_spare_cap = capacity_orig - new_util;
+ target_cpu = i;
+ continue;
+ }
+ if (target_cpu != -1)
+ continue;
+
+
+ /*
+ * Case A.3: Backup ACTIVE CPU
+ * Favor CPUs with:
+ * - lower utilization due to other tasks
+ * - lower utilization with the task in
+ */
+ if (wake_util > min_wake_util)
+ continue;
+ if (new_util > best_active_util)
+ continue;
+ min_wake_util = wake_util;
+ best_active_util = new_util;
+ best_active_cpu = i;
+ continue;
}
+
+ /*
+ * Case B) Non latency sensitive tasks on IDLE CPUs.
+ *
+ * Find an optimal backup IDLE CPU for non latency
+ * sensitive tasks.
+ *
+ * Looking for:
+ * - minimizing the capacity_orig,
+ * i.e. preferring LITTLE CPUs
+ * - favoring shallowest idle states
+ * i.e. avoid to wakeup deep-idle CPUs
+ *
+ * The following code path is used by non latency
+ * sensitive tasks if IDLE CPUs are available. If at
+ * least one of such CPUs are available it sets the
+ * best_idle_cpu to the most suitable idle CPU to be
+ * selected.
+ *
+ * If idle CPUs are available, favour these CPUs to
+ * improve performances by spreading tasks.
+ * Indeed, the energy_diff() computed by the caller
+ * will take care to ensure the minimization of energy
+ * consumptions without affecting performance.
+ */
+ if (idle_cpu(i)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+ /* Select idle CPU with lower cap_orig */
+ if (capacity_orig > best_idle_min_cap_orig)
+ continue;
+
+ /*
+ * Skip CPUs in deeper idle state, but only
+ * if they are also less energy efficient.
+ * IOW, prefer a deep IDLE LITTLE CPU vs a
+ * shallow idle big CPU.
+ */
+ if (sysctl_sched_cstate_aware &&
+ best_idle_cstate <= idle_idx)
+ continue;
+
+ /* Keep track of best idle CPU */
+ best_idle_min_cap_orig = capacity_orig;
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ continue;
+ }
+
+ /*
+ * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+ *
+ * Pack tasks in the most energy efficient capacities.
+ *
+ * This task packing strategy prefers more energy
+ * efficient CPUs (i.e. pack on smaller maximum
+ * capacity CPUs) while also trying to spread tasks to
+ * run them all at the lower OPP.
+ *
+ * This assumes for example that it's more energy
+ * efficient to run two tasks on two CPUs at a lower
+ * OPP than packing both on a single CPU but running
+ * that CPU at an higher OPP.
+ *
+ * Thus, this case keep track of the CPU with the
+ * smallest maximum capacity and highest spare maximum
+ * capacity.
+ */
+
+ /* Favor CPUs with smaller capacity */
+ if (capacity_orig > target_capacity)
+ continue;
+
+ /* Favor CPUs with maximum spare capacity */
+ if ((capacity_orig - new_util) < target_max_spare_cap)
+ continue;
+
+ target_max_spare_cap = capacity_orig - new_util;
+ target_capacity = capacity_orig;
+ target_util = new_util;
+ target_cpu = i;
}
+
} while (sg = sg->next, sg != sd->groups);
- if (target_cpu < 0)
- target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+ /*
+ * For non latency sensitive tasks, cases B and C in the previous loop,
+ * we pick the best IDLE CPU only if we was not able to find a target
+ * ACTIVE CPU.
+ *
+ * Policies priorities:
+ *
+ * - prefer_idle tasks:
+ *
+ * a) IDLE CPU available, we return immediately
+ * b) ACTIVE CPU where task fits and has the bigger maximum spare
+ * capacity (i.e. target_cpu)
+ * c) ACTIVE CPU with less contention due to other tasks
+ * (i.e. best_active_cpu)
+ *
+ * - NON prefer_idle tasks:
+ *
+ * a) ACTIVE CPU: target_cpu
+ * b) IDLE CPU: best_idle_cpu
+ */
+ if (target_cpu == -1)
+ target_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+ else
+ *backup_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
- if (target_cpu >= 0) {
- schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
- schedstat_inc(this_rq(), eas_stats.fbt_count);
- }
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+ best_idle_cpu, best_active_cpu,
+ target_cpu);
+
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
+ schedstat_inc(this_rq(), eas_stats.fbt_count);
return target_cpu;
}
@@ -7483,7 +7686,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
{
struct sched_domain *sd;
- int target_cpu = prev_cpu, tmp_target;
+ int target_cpu = prev_cpu, tmp_target, tmp_backup;
bool boosted, prefer_idle;
schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
@@ -7508,9 +7711,11 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
prefer_idle = 0;
#endif
+ sync_entity_load_avg(&p->se);
+
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
/* Find a cpu with sufficient capacity */
- tmp_target = find_best_target(p, boosted, prefer_idle);
+ tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
if (!sd)
goto unlock;
@@ -7539,10 +7744,15 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
}
if (energy_diff(&eenv) >= 0) {
- schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
- schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
- target_cpu = prev_cpu;
- goto unlock;
+ /* No energy saving for target_cpu, try backup */
+ target_cpu = tmp_backup;
+ eenv.dst_cpu = target_cpu;
+ if (tmp_backup < 0 || energy_diff(&eenv) >= 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
}
schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
@@ -7584,16 +7794,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
return select_best_cpu(p, prev_cpu, 0, sync);
#endif
- if (sd_flag & SD_BALANCE_WAKE) {
- /*
- * do wake_cap unconditionally as it causes task and cpu
- * utilization to be synced, and we need that for energy
- * aware wakeups
- */
- int _wake_cap = wake_cap(p, cpu, prev_cpu);
- want_affine = !wake_wide(p) && !_wake_cap
+ if (sd_flag & SD_BALANCE_WAKE)
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
&& cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
- }
if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu, sync);
@@ -9189,6 +9392,38 @@ group_type group_classify(struct sched_group *group,
return group_other;
}
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * idle load balancing data
+ * - used by the nohz balance, but we want it available here
+ * so that we can see which CPUs have no tick.
+ */
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+ /* only called from update_sg_lb_stats when irqs are disabled */
+ if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
+ /* rate limit updates to once-per-jiffie at most */
+ if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -9220,6 +9455,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (cpu_isolated(i))
continue;
+ /* if we are entering idle and there are CPUs with
+ * their tick stopped, do an update for them
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ update_cpu_stats_if_tickless(rq);
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -10680,11 +10921,6 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
#ifdef CONFIG_SCHED_HMP
static inline int find_new_hmp_ilb(int type)
@@ -11111,6 +11347,10 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
(!energy_aware() || cpu_overutilized(cpu)))
return true;
+ /* Do idle load balance if there have misfit task */
+ if (energy_aware() && rq->misfit_task)
+ return 1;
+
return (rq->nr_running >= 2);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c03d51a017bf..ee095f4e7230 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1823,6 +1823,7 @@ static int find_lowest_rq_hmp(struct task_struct *task)
* the best one based on our affinity and topology.
*/
+retry:
for_each_sched_cluster(cluster) {
if (boost_on_big && cluster->capacity != max_possible_capacity)
continue;
@@ -1830,6 +1831,15 @@ static int find_lowest_rq_hmp(struct task_struct *task)
cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
cpumask_andnot(&candidate_mask, &candidate_mask,
cpu_isolated_mask);
+ /*
+ * When placement boost is active, if there is no eligible CPU
+ * in the highest capacity cluster, we fallback to the other
+ * clusters. So clear the CPUs of the traversed cluster from
+ * the lowest_mask.
+ */
+ if (unlikely(boost_on_big))
+ cpumask_andnot(lowest_mask, lowest_mask,
+ &cluster->cpus);
if (cpumask_empty(&candidate_mask))
continue;
@@ -1869,6 +1879,11 @@ static int find_lowest_rq_hmp(struct task_struct *task)
break;
}
+ if (unlikely(boost_on_big && best_cpu == -1)) {
+ boost_on_big = 0;
+ goto retry;
+ }
+
return best_cpu;
}
#endif /* CONFIG_SCHED_HMP */
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 6e053bd9830c..92c3aae8e056 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -72,7 +72,15 @@ static cpumask_t mpc_mask = CPU_MASK_ALL;
__read_mostly unsigned int walt_ravg_window = 20000000;
/* Min window size (in ns) = 10ms */
+#ifdef CONFIG_HZ_300
+/*
+ * Tick interval becomes to 3333333 due to
+ * rounding error when HZ=300.
+ */
+#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
+#else
#define MIN_SCHED_RAVG_WINDOW 10000000
+#endif
/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
index e181c87a928d..f56c4da16d0b 100644
--- a/kernel/sched/walt.h
+++ b/kernel/sched/walt.h
@@ -55,6 +55,8 @@ static inline void walt_migrate_sync_cpu(int cpu) { }
static inline void walt_init_cpu_efficiency(void) { }
static inline u64 walt_ktime_clock(void) { return 0; }
+#define walt_cpu_high_irqload(cpu) false
+
#endif /* CONFIG_SCHED_WALT */
extern unsigned int walt_disabled;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 39ffd41594ce..9029227e5f57 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -234,6 +234,8 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
+#define long_softirq_pending() (local_softirq_pending() & LONG_SOFTIRQ_MASK)
+#define defer_for_rt() (long_softirq_pending() && cpupri_check_rt())
asmlinkage __visible void __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
@@ -297,6 +299,7 @@ restart:
pending = local_softirq_pending();
if (pending) {
if (time_before(jiffies, end) && !need_resched() &&
+ !defer_for_rt() &&
--max_restart)
goto restart;
@@ -349,7 +352,7 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
- if (!force_irqthreads) {
+ if (!force_irqthreads && !defer_for_rt()) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
* We can safely execute softirq on the current stack if
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f27d2ba78d14..8576e6385d63 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2380,9 +2380,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*negp)
return -EINVAL;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
+ *negp = false;
*lvalp = (unsigned long)val;
}
return 0;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 1a4de0022cc5..ceec77c652b5 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -848,7 +848,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
* Rate limit to the tick as a hot fix to prevent DOS. Will be
* mopped up later.
*/
- if (ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC)
+ if (timr->it.alarm.interval.tv64 &&
+ ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC)
timr->it.alarm.interval = ktime_set(0, TICK_NSEC);
exp = timespec_to_ktime(new_setting->it_value);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ec2102104cb8..333f627a3a3b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -896,6 +896,18 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->idle_calls;
+}
+
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 34b2a0d5cf1a..eba904bae48c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3535,7 +3535,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
int exclude_mod = 0;
int found = 0;
int ret;
- int clear_filter;
+ int clear_filter = 0;
if (func) {
func_g.type = filter_parse_regex(func, len, &func_g.search,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 60d246c4eefa..a579a874045b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1362,7 +1362,7 @@ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
unsigned *map_cmdline_to_pid;
- unsigned *saved_tgids;
+ unsigned *map_cmdline_to_tgid;
unsigned cmdline_num;
int cmdline_idx;
char *saved_cmdlines;
@@ -1396,9 +1396,10 @@ static int allocate_cmdlines_buffer(unsigned int val,
return -ENOMEM;
}
- s->saved_tgids = kmalloc_array(val, sizeof(*s->saved_tgids),
- GFP_KERNEL);
- if (!s->saved_tgids) {
+ s->map_cmdline_to_tgid = kmalloc_array(val,
+ sizeof(*s->map_cmdline_to_tgid),
+ GFP_KERNEL);
+ if (!s->map_cmdline_to_tgid) {
kfree(s->map_cmdline_to_pid);
kfree(s->saved_cmdlines);
return -ENOMEM;
@@ -1410,8 +1411,8 @@ static int allocate_cmdlines_buffer(unsigned int val,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
- memset(s->saved_tgids, 0,
- val * sizeof(*s->saved_tgids));
+ memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_tgid));
return 0;
}
@@ -1577,14 +1578,17 @@ static int trace_save_cmdline(struct task_struct *tsk)
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
return 0;
+ preempt_disable();
/*
* It's not the end of the world if we don't get
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
*/
- if (!arch_spin_trylock(&trace_cmdline_lock))
+ if (!arch_spin_trylock(&trace_cmdline_lock)) {
+ preempt_enable();
return 0;
+ }
idx = savedcmd->map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
@@ -1607,8 +1611,9 @@ static int trace_save_cmdline(struct task_struct *tsk)
}
set_cmdline(idx, tsk->comm);
- savedcmd->saved_tgids[idx] = tsk->tgid;
+ savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
return 1;
}
@@ -1650,19 +1655,29 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
-int trace_find_tgid(int pid)
+static int __find_tgid_locked(int pid)
{
unsigned map;
int tgid;
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
map = savedcmd->map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
- tgid = savedcmd->saved_tgids[map];
+ tgid = savedcmd->map_cmdline_to_tgid[map];
else
tgid = -1;
+ return tgid;
+}
+
+int trace_find_tgid(int pid)
+{
+ int tgid;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ tgid = __find_tgid_locked(pid);
+
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
@@ -3979,10 +3994,15 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
{
char buf[64];
int r;
+ unsigned int n;
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
- r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ n = savedcmd->cmdline_num;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ r = scnprintf(buf, sizeof(buf), "%u\n", n);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -3991,7 +4011,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
kfree(s->saved_cmdlines);
kfree(s->map_cmdline_to_pid);
- kfree(s->saved_tgids);
+ kfree(s->map_cmdline_to_tgid);
kfree(s);
}
@@ -4008,10 +4028,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
return -ENOMEM;
}
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -4230,33 +4252,61 @@ tracing_saved_tgids_read(struct file *file, char __user *ubuf,
char *file_buf;
char *buf;
int len = 0;
- int pid;
int i;
+ int *pids;
+ int n = 0;
- file_buf = kmalloc(savedcmd->cmdline_num*(16+1+16), GFP_KERNEL);
- if (!file_buf)
- return -ENOMEM;
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
- buf = file_buf;
+ pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL);
+ if (!pids) {
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+ return -ENOMEM;
+ }
for (i = 0; i < savedcmd->cmdline_num; i++) {
- int tgid;
- int r;
+ int pid;
pid = savedcmd->map_cmdline_to_pid[i];
if (pid == -1 || pid == NO_CMDLINE_MAP)
continue;
- tgid = trace_find_tgid(pid);
- r = sprintf(buf, "%d %d\n", pid, tgid);
+ pids[n] = pid;
+ pids[n+1] = __find_tgid_locked(pid);
+ n += 2;
+ }
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ if (n == 0) {
+ kfree(pids);
+ return 0;
+ }
+
+ /* enough to hold max pair of pids + space, lr and nul */
+ len = n * 12;
+ file_buf = kmalloc(len, GFP_KERNEL);
+ if (!file_buf) {
+ kfree(pids);
+ return -ENOMEM;
+ }
+
+ buf = file_buf;
+ for (i = 0; i < n && len > 0; i += 2) {
+ int r;
+
+ r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]);
buf += r;
- len += r;
+ len -= r;
}
len = simple_read_from_buffer(ubuf, cnt, ppos,
- file_buf, len);
+ file_buf, buf - file_buf);
kfree(file_buf);
+ kfree(pids);
return len;
}
@@ -6847,6 +6897,7 @@ static int instance_rmdir(const char *name)
}
kfree(tr->topts);
+ free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 12ea4ea619ee..e9092a0247bf 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -659,30 +659,25 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- if (isdigit(argv[1][0])) {
- if (is_return) {
- pr_info("Return probe point must be a symbol.\n");
- return -EINVAL;
- }
- /* an address specified */
- ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
- if (ret) {
- pr_info("Failed to parse address.\n");
- return ret;
- }
- } else {
+
+ /* try to parse an address. if that fails, try to read the
+ * input as a symbol. */
+ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
/* a symbol specified */
symbol = argv[1];
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret) {
- pr_info("Failed to parse symbol.\n");
+ pr_info("Failed to parse either an address or a symbol.\n");
return ret;
}
if (offset && is_return) {
pr_info("Return probe must be used without offset.\n");
return -EINVAL;
}
+ } else if (is_return) {
+ pr_info("Return probe point must be a symbol.\n");
+ return -EINVAL;
}
argc -= 2; argv += 2;