Diffstat (limited to 'kernel')
-rw-r--r--   kernel/bpf/verifier.c              |   5
-rw-r--r--   kernel/cpu.c                       |  19
-rw-r--r--   kernel/events/core.c               |  73
-rw-r--r--   kernel/events/hw_breakpoint.c      |   2
-rw-r--r--   kernel/extable.c                   |   2
-rw-r--r--   kernel/fork.c                      |   3
-rw-r--r--   kernel/locking/rwsem-xadd.c        |  35
-rw-r--r--   kernel/resource.c                  |  13
-rw-r--r--   kernel/sched/core.c                |  36
-rw-r--r--   kernel/sched/cpufreq_schedutil.c   | 154
-rw-r--r--   kernel/sched/cpupri.c              |  11
-rw-r--r--   kernel/sched/fair.c                | 408
-rw-r--r--   kernel/sched/rt.c                  |  15
-rw-r--r--   kernel/sched/walt.c                |   8
-rw-r--r--   kernel/sched/walt.h                |   2
-rw-r--r--   kernel/softirq.c                   |   5
-rw-r--r--   kernel/sysctl.c                    |   3
-rw-r--r--   kernel/time/alarmtimer.c           |   3
-rw-r--r--   kernel/time/tick-sched.c           |  12
-rw-r--r--   kernel/trace/ftrace.c              |   2
-rw-r--r--   kernel/trace/trace.c               | 101
-rw-r--r--   kernel/trace/trace_kprobe.c        |  21
22 files changed, 702 insertions(+), 231 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 85de5094b936..c97bce6a0e0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -765,6 +765,11 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d leaks addr into mem\n", insn->src_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); diff --git a/kernel/cpu.c b/kernel/cpu.c index 1a26ef5b7d58..1d6b0a209bc0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -375,6 +375,21 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) goto out_release; } + /* + * By now we've cleared cpu_active_mask, wait for all preempt-disabled + * and RCU users of this state to go away such that all new such users + * will observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + smpboot_park_threads(cpu); /* @@ -508,8 +523,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); + pr_warn_ratelimited("%s: attempt to bring up CPU %u failed\n", + __func__, cpu); goto out_notify; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 7fee87daac56..87f5d841f796 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1693,7 +1693,33 @@ static int __perf_remove_from_context(void *info) return 0; } -/* + +#ifdef CONFIG_SMP +static void perf_retry_remove(struct perf_event *event, + struct remove_event *rep) +{ + int up_ret; + /* + * CPU was offline. Bring it online so we can + * gracefully exit a perf context. + */ + up_ret = cpu_up(event->cpu); + if (!up_ret) + /* Try the remove call once again. */ + cpu_function_call(event->cpu, __perf_remove_from_context, + rep); + else + pr_err("Failed to bring up CPU: %d, ret: %d\n", + event->cpu, up_ret); +} +#else +static void perf_retry_remove(struct perf_event *event, + struct remove_event *rep) +{ +} +#endif + + /* * Remove the event from a task's (or a CPU's) list of events. * * CPU events are removed with a smp call. 
For task events we only @@ -1728,6 +1754,9 @@ static void __ref perf_remove_from_context(struct perf_event *event, */ ret = cpu_function_call(event->cpu, __perf_remove_from_context, &re); + if (ret == -ENXIO) + perf_retry_remove(event, &re); + return; } @@ -3408,22 +3437,27 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0; + int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ + event_cpu = READ_ONCE(event->oncpu); + if (event->state == PERF_EVENT_STATE_ACTIVE && - !cpu_isolated(event->oncpu)) { + !cpu_isolated(event_cpu)) { struct perf_read_data data = { .event = event, .group = group, .ret = 0, }; + + if ((unsigned int)event_cpu >= nr_cpu_ids) + return 0; if (!event->attr.exclude_idle || - !per_cpu(is_idle, event->oncpu)) { - smp_call_function_single(event->oncpu, + !per_cpu(is_idle, event_cpu)) { + smp_call_function_single(event_cpu, __perf_event_read, &data, 1); ret = data.ret; } @@ -6566,21 +6600,6 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } -static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) -{ - /* - * Due to interrupt latency (AKA "skid"), we may enter the - * kernel before taking an overflow, even if the PMU is only - * counting user events. - * To avoid leaking information to userspace, we must always - * reject kernel samples when exclude_kernel is set. - */ - if (event->attr.exclude_kernel && !user_mode(regs)) - return false; - - return true; -} - /* * Generic event overflow handling, sampling. */ @@ -6628,12 +6647,6 @@ static int __perf_event_overflow(struct perf_event *event, } /* - * For security, drop the skid kernel samples if necessary. 
- */ - if (!sample_is_allowed(event, regs)) - return ret; - - /* * XXX event_limit might not quite work as expected on inherited * events */ @@ -7109,6 +7122,8 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .events_across_hotplug = 1, }; #ifdef CONFIG_EVENT_TRACING @@ -7230,6 +7245,8 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .events_across_hotplug = 1, }; static inline void perf_tp_register(void) @@ -7517,6 +7534,8 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, + + .events_across_hotplug = 1, }; /* @@ -7598,6 +7617,8 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, + + .events_across_hotplug = 1, }; static void perf_pmu_nop_void(struct pmu *pmu) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..7da5b674d16e 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -614,6 +614,8 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, + + .events_across_hotplug = 1, }; int __init init_hw_breakpoint(void) diff --git a/kernel/extable.c b/kernel/extable.c index e820ccee9846..4f06fc34313f 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -66,7 +66,7 @@ static inline int init_kernel_text(unsigned long addr) return 0; } -int core_kernel_text(unsigned long addr) +int notrace core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)_etext) diff --git a/kernel/fork.c b/kernel/fork.c index 246b8a57a32d..fef4df444f47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -832,8 +832,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode) && - !capable(CAP_SYS_RESOURCE)) { + !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..75c950ede9c7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -511,6 +511,41 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; /* + * If a spinner is present, there is a chance that the load of + * rwsem_has_spinner() in rwsem_wake() can be reordered with + * respect to decrement of rwsem count in __up_write() leading + * to wakeup being missed. + * + * spinning writer up_write caller + * --------------- ----------------------- + * [S] osq_unlock() [L] osq + * spin_lock(wait_lock) + * sem->count=0xFFFFFFFF00000001 + * +0xFFFFFFFF00000000 + * count=sem->count + * MB + * sem->count=0xFFFFFFFE00000001 + * -0xFFFFFFFF00000001 + * RMB + * spin_trylock(wait_lock) + * return + * rwsem_try_write_lock(count) + * spin_unlock(wait_lock) + * schedule() + * + * Reordering of atomic_long_sub_return_release() in __up_write() + * and rwsem_has_spinner() in rwsem_wake() can cause missing of + * wakeup in up_write() context. In spinning writer, sem->count + * and local variable count is 0XFFFFFFFE00000001. It would result + * in rwsem_try_write_lock() failing to acquire rwsem and spinning + * writer going to sleep in rwsem_down_write_failed(). 
+ * + * The smp_rmb() here is to make sure that the spinner state is + * consulted after sem->count is updated in up_write context. + */ + smp_rmb(); + + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize * spinlock contention which may introduce too much delay in the diff --git a/kernel/resource.c b/kernel/resource.c index 4c9835c09dcd..c09d484f7b5f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v) { struct resource *root = m->private; struct resource *r = v, *p; + unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; int depth; for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; + + if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) { + start = r->start; + end = r->end; + } else { + start = end = 0; + } + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", - width, (unsigned long long) r->start, - width, (unsigned long long) r->end, + width, start, + width, end, r->name ? r->name : "<BAD>"); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0071785e698b..4ecca604e64b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6924,6 +6924,9 @@ enum s_alloc { * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. * + * Only CPUs that can arrive at this group should be considered to continue + * balancing. + * * Asymmetric node setups can result in situations where the domain tree is of * unequal depth, make sure to skip domains that already cover the entire * range. @@ -6935,18 +6938,31 @@ enum s_alloc { */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { - const struct cpumask *span = sched_domain_span(sd); + const struct cpumask *sg_span = sched_group_cpus(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, span) { + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + + /* + * Can happen in the asymmetric case, where these siblings are + * unused. The mask will not be empty because those CPUs that + * do have the top domain _should_ span the domain. 
+ */ + if (!sibling->child) + continue; + + /* If we would not end up here, we can't continue from here */ + if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) continue; cpumask_set_cpu(i, sched_group_mask(sg)); } + + /* We must not have empty masks here */ + WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg))); } /* @@ -9216,11 +9232,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - sched_online_group(tg, parent); - return &tg->css; } +/* Expose task group only after completing cgroup initialization */ +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); + + if (parent) + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -9602,6 +9627,7 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .fork = cpu_cgroup_fork, diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 75bfbb336722..e12309c1b07b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,6 +47,7 @@ struct sugov_policy { s64 up_rate_delay_ns; s64 down_rate_delay_ns; unsigned int next_freq; + unsigned int cached_raw_freq; /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; @@ -63,7 +64,6 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; - unsigned int cached_raw_freq; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -72,6 +72,11 @@ struct sugov_cpu { unsigned long util; unsigned long max; unsigned int flags; + + /* The field below is for single-CPU policies only. */ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -127,22 +132,20 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) return; + if (sg_policy->next_freq == next_freq) + return; + + sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; + if (policy->fast_switch_enabled) { - if (sg_policy->next_freq == next_freq) { - trace_cpu_frequency(policy->cur, smp_processor_id()); - return; - } - sg_policy->next_freq = next_freq; - sg_policy->last_freq_update_time = time; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else if (sg_policy->next_freq != next_freq) { - sg_policy->next_freq = next_freq; - sg_policy->last_freq_update_time = time; + } else { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -150,7 +153,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, /** * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @sg_policy: schedutil policy object to compute the new frequency for. * @util: Current CPU utilization. * @max: CPU capacity. 
* @@ -170,19 +173,18 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, * next_freq (as calculated above) is returned, subject to policy min/max and * cpufreq driver limitations. */ -static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, - unsigned long max) +static unsigned int get_next_freq(struct sugov_policy *sg_policy, + unsigned long util, unsigned long max) { - struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) return sg_policy->next_freq; - sg_cpu->cached_raw_freq = freq; + sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -248,6 +250,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, sg_cpu->iowait_boost >>= 1; } +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -256,6 +271,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; + bool busy; sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -263,40 +279,37 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; + busy = sugov_cpu_is_busy(sg_cpu); + if (flags & SCHED_CPUFREQ_DL) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max, time); sugov_iowait_boost(sg_cpu, &util, &max); - next_f = get_next_freq(sg_cpu, util, max); + next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. 
+ */ + if (busy && next_f < sg_policy->next_freq) + next_f = sg_policy->next_freq; } sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max, - unsigned int flags) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned long util = 0, max = 1; unsigned int j; - if (flags & SCHED_CPUFREQ_DL) - return max_f; - - sugov_iowait_boost(sg_cpu, &util, &max); - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu; + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; s64 delta_ns; - if (j == smp_processor_id()) - continue; - - j_sg_cpu = &per_cpu(sugov_cpu, j); /* * If the CPU utilization was last updated before the previous * frequency update and the time elapsed between the last update @@ -310,7 +323,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) - return max_f; + return policy->cpuinfo.max_freq; j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; @@ -322,7 +335,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, sugov_iowait_boost(j_sg_cpu, &util, &max); } - return get_next_freq(sg_cpu, util, max); + return get_next_freq(sg_policy, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, @@ -345,7 +358,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + if (flags & SCHED_CPUFREQ_DL) + next_f = sg_policy->policy->cpuinfo.max_freq; + else + next_f = sugov_next_freq_shared(sg_cpu); + sugov_update_commit(sg_policy, time, next_f); } @@ -371,15 +388,15 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For Real Time and Deadline tasks, schedutil governor shoots the - * frequency to maximum. And special care must be taken to ensure that - * this kthread doesn't result in that. + * For RT and deadline tasks, the schedutil governor shoots the + * frequency to maximum. Special care must be taken to ensure that this + * kthread doesn't result in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is - * updated only at the end of the sugov_work() and before that schedutil - * rejects all other frequency scaling requests. + * updated only at the end of the sugov_work() function and before that + * the schedutil governor rejects all other frequency scaling requests. * - * Though there is a very rare case where the RT thread yields right + * There is a very rare case though, where the RT thread yields right * after the work_in_progress flag is cleared. The effects of that are * neglected for now. 
*/ @@ -489,15 +506,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } @@ -531,6 +545,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) sg_policy->thread = thread; kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + wake_up_process(thread); return 0; @@ -544,6 +561,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy) flush_kthread_worker(&sg_policy->worker); kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); } static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) @@ -578,9 +596,13 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } ret = sugov_kthread_create(sg_policy); if (ret) @@ -623,13 +645,11 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); @@ -640,6 +660,10 @@ free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -650,8 +674,6 @@ static int sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -664,6 +686,7 @@ static int sugov_exit(struct cpufreq_policy *policy) sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); return 0; } @@ -681,25 +704,19 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = UINT_MAX; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->sg_policy = sg_policy; - if (policy_is_shared(policy)) { - sg_cpu->util = 0; - sg_cpu->max = 0; - sg_cpu->flags = SCHED_CPUFREQ_DL; - sg_cpu->last_update = 0; - sg_cpu->cached_raw_freq = 0; - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_shared); - } else { - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_single); - } + sg_cpu->flags = SCHED_CPUFREQ_DL; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + policy_is_shared(policy) ? 
+ sugov_update_shared : + sugov_update_single); } return 0; } @@ -714,9 +731,10 @@ static int sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - kthread_cancel_work_sync(&sg_policy->work); - + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } return 0; } diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1d00cf8c00fa..14225d5d8617 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -279,3 +279,14 @@ void cpupri_cleanup(struct cpupri *cp) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } + +/* + * cpupri_check_rt - check if CPU has a RT task + * should be called from rcu-sched read section. + */ +bool cpupri_check_rt(void) +{ + int cpu = raw_smp_processor_id(); + + return cpu_rq(cpu)->rd->cpupri.cpu_to_pri[cpu] > CPUPRI_NORMAL; +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 422438d43d90..853064319b0d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -35,6 +35,8 @@ #include "sched.h" #include <trace/events/sched.h> #include "tune.h" +#include "walt.h" + /* * Targeted preemption latency for CPU-bound tasks: * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -6548,9 +6550,11 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } -static int group_idle_state(struct sched_group *sg) +static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) { int i, state = INT_MAX; + int src_in_grp, dst_in_grp; + long grp_util = 0; /* Find the shallowest idle state in the sched group. */ for_each_cpu(i, sched_group_cpus(sg)) @@ -6559,6 +6563,54 @@ static int group_idle_state(struct sched_group *sg) /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ state++; + /* + * Try to estimate if a deeper idle state is + * achievable when we move the task. + */ + for_each_cpu(i, sched_group_cpus(sg)) + grp_util += cpu_util(i); + + src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); + if (src_in_grp == dst_in_grp) { + /* both CPUs under consideration are in the same group or not in + * either group, migration should leave idle state the same. + */ + goto end; + } + /* add or remove util as appropriate to indicate what group util + * will be (worst case - no concurrent execution) after moving the task + */ + grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta; + + if (grp_util <= + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) { + /* after moving, this group is at most partly + * occupied, so it should have some idle time. + */ + int max_idle_state_idx = sg->sge->nr_idle_states - 2; + int new_state = grp_util * max_idle_state_idx; + if (grp_util <= 0) + /* group will have no util, use lowest state */ + new_state = max_idle_state_idx + 1; + else { + /* for partially idle, linearly map util to idle + * states, excluding the lowest one. This does not + * correspond to the state we expect to enter in + * reality, but an indication of what might happen. + */ + new_state = min(max_idle_state_idx, (int) + (new_state / sg->sgc->max_capacity)); + new_state = max_idle_state_idx - new_state; + } + state = new_state; + } else { + /* After moving, the group will be fully occupied + * so assume it will not be idle at all. 
+ */ + state = 0; + } +end: return state; } @@ -6631,8 +6683,9 @@ static int sched_group_energy(struct energy_env *eenv) } } - idle_idx = group_idle_state(sg); + idle_idx = group_idle_state(eenv, sg); group_util = group_norm_util(eenv, sg); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) @@ -7337,48 +7390,59 @@ static int start_cpu(bool boosted) return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } -static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) +static inline int find_best_target(struct task_struct *p, int *backup_cpu, + bool boosted, bool prefer_idle) { - int target_cpu = -1; - unsigned long target_util = prefer_idle ? ULONG_MAX : 0; - unsigned long backup_capacity = ULONG_MAX; - int best_idle_cpu = -1; - int best_idle_cstate = INT_MAX; - int backup_cpu = -1; + unsigned long best_idle_min_cap_orig = ULONG_MAX; unsigned long min_util = boosted_task_util(p); + unsigned long target_capacity = ULONG_MAX; + unsigned long min_wake_util = ULONG_MAX; + unsigned long target_max_spare_cap = 0; + unsigned long target_util = ULONG_MAX; + unsigned long best_active_util = ULONG_MAX; + int best_idle_cstate = INT_MAX; struct sched_domain *sd; struct sched_group *sg; - int cpu = start_cpu(boosted); + int best_active_cpu = -1; + int best_idle_cpu = -1; + int target_cpu = -1; + int cpu, i; + + *backup_cpu = -1; schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); schedstat_inc(this_rq(), eas_stats.fbt_attempts); + /* Find start CPU based on boost value */ + cpu = start_cpu(boosted); if (cpu < 0) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); - return target_cpu; + return -1; } + /* Find SD for the start CPU */ sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!sd) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd); schedstat_inc(this_rq(), eas_stats.fbt_no_sd); - return target_cpu; + return -1; } + /* Scan CPUs in all SDs */ sg = sd->groups; - do { - int i; - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - unsigned long cur_capacity, new_util, wake_util; - unsigned long min_wake_util = ULONG_MAX; + unsigned long capacity_curr = capacity_curr_of(i); + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util; if (!cpu_online(i)) continue; + if (walt_cpu_high_irqload(i)) + continue; + /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double @@ -7393,65 +7457,204 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * than the one required to boost the task. */ new_util = max(min_util, new_util); - - if (new_util > capacity_orig_of(i)) + if (new_util > capacity_orig) continue; /* - * Unconditionally favoring tasks that prefer idle cpus to + * Case A) Latency sensitive tasks + * + * Unconditionally favoring tasks that prefer idle CPU to * improve latency. + * + * Looking for: + * - an idle CPU, whatever its idle_state is, since + * the first CPUs we explore are more likely to be + * reserved for latency sensitive tasks. + * - a non idle CPU where the task fits in its current + * capacity and has the maximum spare capacity. + * - a non idle CPU with lower contention from other + * tasks and running at the lowest possible OPP. + * + * The last two goals tries to favor a non idle CPU + * where the task can run as if it is "almost alone". 
+ * A maximum spare capacity CPU is favoured since + * the task already fits into that CPU's capacity + * without waiting for an OPP chance. + * + * The following code path is the only one in the CPUs + * exploration loop which is always used by + * prefer_idle tasks. It exits the loop with wither a + * best_active_cpu or a target_cpu which should + * represent an optimal choice for latency sensitive + * tasks. */ - if (idle_cpu(i) && prefer_idle) { - schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); - schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); - return i; - } + if (prefer_idle) { - cur_capacity = capacity_curr_of(i); - - if (new_util < cur_capacity) { - if (cpu_rq(i)->nr_running) { - /* - * Find a target cpu with the lowest/highest - * utilization if prefer_idle/!prefer_idle. - */ - if (prefer_idle) { - /* Favor the CPU that last ran the task */ - if (new_util > target_util || - wake_util > min_wake_util) - continue; - min_wake_util = wake_util; - target_util = new_util; - target_cpu = i; - } else if (target_util < new_util) { - target_util = new_util; - target_cpu = i; - } - } else if (!prefer_idle) { - int idle_idx = idle_get_state_idx(cpu_rq(i)); + /* + * Case A.1: IDLE CPU + * Return the first IDLE CPU we find. + */ + if (idle_cpu(i)) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); - if (best_idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - best_idle_cpu = i; - } + trace_sched_find_best_target(p, + prefer_idle, min_util, + cpu, best_idle_cpu, + best_active_cpu, i); + + return i; } - } else if (backup_capacity > cur_capacity) { - /* Find a backup cpu with least capacity. */ - backup_capacity = cur_capacity; - backup_cpu = i; + + /* + * Case A.2: Target ACTIVE CPU + * Favor CPUs with max spare capacity. + */ + if ((capacity_curr > new_util) && + (capacity_orig - new_util > target_max_spare_cap)) { + target_max_spare_cap = capacity_orig - new_util; + target_cpu = i; + continue; + } + if (target_cpu != -1) + continue; + + + /* + * Case A.3: Backup ACTIVE CPU + * Favor CPUs with: + * - lower utilization due to other tasks + * - lower utilization with the task in + */ + if (wake_util > min_wake_util) + continue; + if (new_util > best_active_util) + continue; + min_wake_util = wake_util; + best_active_util = new_util; + best_active_cpu = i; + continue; } + + /* + * Case B) Non latency sensitive tasks on IDLE CPUs. + * + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - minimizing the capacity_orig, + * i.e. preferring LITTLE CPUs + * - favoring shallowest idle states + * i.e. avoid to wakeup deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one of such CPUs are available it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performances by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumptions without affecting performance. + */ + if (idle_cpu(i)) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + + /* Select idle CPU with lower cap_orig */ + if (capacity_orig > best_idle_min_cap_orig) + continue; + + /* + * Skip CPUs in deeper idle state, but only + * if they are also less energy efficient. 
+ * IOW, prefer a deep IDLE LITTLE CPU vs a + * shallow idle big CPU. + */ + if (sysctl_sched_cstate_aware && + best_idle_cstate <= idle_idx) + continue; + + /* Keep track of best idle CPU */ + best_idle_min_cap_orig = capacity_orig; + best_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + + /* + * Case C) Non latency sensitive tasks on ACTIVE CPUs. + * + * Pack tasks in the most energy efficient capacities. + * + * This task packing strategy prefers more energy + * efficient CPUs (i.e. pack on smaller maximum + * capacity CPUs) while also trying to spread tasks to + * run them all at the lower OPP. + * + * This assumes for example that it's more energy + * efficient to run two tasks on two CPUs at a lower + * OPP than packing both on a single CPU but running + * that CPU at an higher OPP. + * + * Thus, this case keep track of the CPU with the + * smallest maximum capacity and highest spare maximum + * capacity. + */ + + /* Favor CPUs with smaller capacity */ + if (capacity_orig > target_capacity) + continue; + + /* Favor CPUs with maximum spare capacity */ + if ((capacity_orig - new_util) < target_max_spare_cap) + continue; + + target_max_spare_cap = capacity_orig - new_util; + target_capacity = capacity_orig; + target_util = new_util; + target_cpu = i; } + } while (sg = sg->next, sg != sd->groups); - if (target_cpu < 0) - target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; + /* + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we was not able to find a target + * ACTIVE CPU. + * + * Policies priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available, we return immediately + * b) ACTIVE CPU where task fits and has the bigger maximum spare + * capacity (i.e. target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu + */ + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + else + *backup_cpu = prefer_idle + ? 
best_active_cpu + : best_idle_cpu; - if (target_cpu >= 0) { - schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); - schedstat_inc(this_rq(), eas_stats.fbt_count); - } + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, + best_idle_cpu, best_active_cpu, + target_cpu); + + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq(), eas_stats.fbt_count); return target_cpu; } @@ -7483,7 +7686,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { struct sched_domain *sd; - int target_cpu = prev_cpu, tmp_target; + int target_cpu = prev_cpu, tmp_target, tmp_backup; bool boosted, prefer_idle; schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); @@ -7508,9 +7711,11 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync prefer_idle = 0; #endif + sync_entity_load_avg(&p->se); + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); /* Find a cpu with sufficient capacity */ - tmp_target = find_best_target(p, boosted, prefer_idle); + tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle); if (!sd) goto unlock; @@ -7539,10 +7744,15 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } if (energy_diff(&eenv) >= 0) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); - target_cpu = prev_cpu; - goto unlock; + /* No energy saving for target_cpu, try backup */ + target_cpu = tmp_backup; + eenv.dst_cpu = target_cpu; + if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; + goto unlock; + } } schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); @@ -7584,16 +7794,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return select_best_cpu(p, prev_cpu, 0, sync); #endif - if (sd_flag & SD_BALANCE_WAKE) { - /* - * do wake_cap unconditionally as it causes task and cpu - * utilization to be synced, and we need that for energy - * aware wakeups - */ - int _wake_cap = wake_cap(p, cpu, prev_cpu); - want_affine = !wake_wide(p) && !_wake_cap + if (sd_flag & SD_BALANCE_WAKE) + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); - } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); @@ -9189,6 +9392,38 @@ group_type group_classify(struct sched_group *group, return group_other; } +#ifdef CONFIG_NO_HZ_COMMON +/* + * idle load balancing data + * - used by the nohz balance, but we want it available here + * so that we can see which CPUs have no tick. 
+ */ +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; + +static inline void update_cpu_stats_if_tickless(struct rq *rq) +{ + /* only called from update_sg_lb_stats when irqs are disabled */ + if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) { + /* rate limit updates to once-per-jiffie at most */ + if (READ_ONCE(jiffies) <= rq->last_load_update_tick) + return; + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false); + raw_spin_unlock(&rq->lock); + } +} + +#else +static inline void update_cpu_stats_if_tickless(struct rq *rq) { } +#endif + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -9220,6 +9455,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (cpu_isolated(i)) continue; + /* if we are entering idle and there are CPUs with + * their tick stopped, do an update for them + */ + if (env->idle == CPU_NEWLY_IDLE) + update_cpu_stats_if_tickless(rq); + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -10680,11 +10921,6 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ -} nohz ____cacheline_aligned; #ifdef CONFIG_SCHED_HMP static inline int find_new_hmp_ilb(int type) @@ -11111,6 +11347,10 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type) (!energy_aware() || cpu_overutilized(cpu))) return true; + /* Do idle load balance if there have misfit task */ + if (energy_aware() && rq->misfit_task) + return 1; + return (rq->nr_running >= 2); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c03d51a017bf..ee095f4e7230 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1823,6 +1823,7 @@ static int find_lowest_rq_hmp(struct task_struct *task) * the best one based on our affinity and topology. */ +retry: for_each_sched_cluster(cluster) { if (boost_on_big && cluster->capacity != max_possible_capacity) continue; @@ -1830,6 +1831,15 @@ static int find_lowest_rq_hmp(struct task_struct *task) cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); cpumask_andnot(&candidate_mask, &candidate_mask, cpu_isolated_mask); + /* + * When placement boost is active, if there is no eligible CPU + * in the highest capacity cluster, we fallback to the other + * clusters. So clear the CPUs of the traversed cluster from + * the lowest_mask. + */ + if (unlikely(boost_on_big)) + cpumask_andnot(lowest_mask, lowest_mask, + &cluster->cpus); if (cpumask_empty(&candidate_mask)) continue; @@ -1869,6 +1879,11 @@ static int find_lowest_rq_hmp(struct task_struct *task) break; } + if (unlikely(boost_on_big && best_cpu == -1)) { + boost_on_big = 0; + goto retry; + } + return best_cpu; } #endif /* CONFIG_SCHED_HMP */ diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 6e053bd9830c..92c3aae8e056 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -72,7 +72,15 @@ static cpumask_t mpc_mask = CPU_MASK_ALL; __read_mostly unsigned int walt_ravg_window = 20000000; /* Min window size (in ns) = 10ms */ +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes to 3333333 due to + * rounding error when HZ=300. 
+ */ +#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) +#else #define MIN_SCHED_RAVG_WINDOW 10000000 +#endif /* Max window size (in ns) = 1s */ #define MAX_SCHED_RAVG_WINDOW 1000000000 diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index e181c87a928d..f56c4da16d0b 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -55,6 +55,8 @@ static inline void walt_migrate_sync_cpu(int cpu) { } static inline void walt_init_cpu_efficiency(void) { } static inline u64 walt_ktime_clock(void) { return 0; } +#define walt_cpu_high_irqload(cpu) false + #endif /* CONFIG_SCHED_WALT */ extern unsigned int walt_disabled; diff --git a/kernel/softirq.c b/kernel/softirq.c index 39ffd41594ce..9029227e5f57 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -234,6 +234,8 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif +#define long_softirq_pending() (local_softirq_pending() & LONG_SOFTIRQ_MASK) +#define defer_for_rt() (long_softirq_pending() && cpupri_check_rt()) asmlinkage __visible void __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; @@ -297,6 +299,7 @@ restart: pending = local_softirq_pending(); if (pending) { if (time_before(jiffies, end) && !need_resched() && + !defer_for_rt() && --max_restart) goto restart; @@ -349,7 +352,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (!force_irqthreads) { + if (!force_irqthreads && !defer_for_rt()) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f27d2ba78d14..8576e6385d63 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2380,9 +2380,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, if (write) { if (*negp) return -EINVAL; + if (*lvalp > UINT_MAX) + return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; + *negp = false; *lvalp = (unsigned long)val; } return 0; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 1a4de0022cc5..ceec77c652b5 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -848,7 +848,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, * Rate limit to the tick as a hot fix to prevent DOS. Will be * mopped up later. */ - if (ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC) + if (timr->it.alarm.interval.tv64 && + ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC) timr->it.alarm.interval = ktime_set(0, TICK_NSEC); exp = timespec_to_ktime(new_setting->it_value); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ec2102104cb8..333f627a3a3b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -896,6 +896,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. 
+ */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 34b2a0d5cf1a..eba904bae48c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3535,7 +3535,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) int exclude_mod = 0; int found = 0; int ret; - int clear_filter; + int clear_filter = 0; if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60d246c4eefa..a579a874045b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1362,7 +1362,7 @@ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; - unsigned *saved_tgids; + unsigned *map_cmdline_to_tgid; unsigned cmdline_num; int cmdline_idx; char *saved_cmdlines; @@ -1396,9 +1396,10 @@ static int allocate_cmdlines_buffer(unsigned int val, return -ENOMEM; } - s->saved_tgids = kmalloc_array(val, sizeof(*s->saved_tgids), - GFP_KERNEL); - if (!s->saved_tgids) { + s->map_cmdline_to_tgid = kmalloc_array(val, + sizeof(*s->map_cmdline_to_tgid), + GFP_KERNEL); + if (!s->map_cmdline_to_tgid) { kfree(s->map_cmdline_to_pid); kfree(s->saved_cmdlines); return -ENOMEM; @@ -1410,8 +1411,8 @@ static int allocate_cmdlines_buffer(unsigned int val, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); - memset(s->saved_tgids, 0, - val * sizeof(*s->saved_tgids)); + memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_tgid)); return 0; } @@ -1577,14 +1578,17 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; + preempt_disable(); /* * It's not the end of the world if we don't get * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. 
*/ - if (!arch_spin_trylock(&trace_cmdline_lock)) + if (!arch_spin_trylock(&trace_cmdline_lock)) { + preempt_enable(); return 0; + } idx = savedcmd->map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { @@ -1607,8 +1611,9 @@ static int trace_save_cmdline(struct task_struct *tsk) } set_cmdline(idx, tsk->comm); - savedcmd->saved_tgids[idx] = tsk->tgid; + savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return 1; } @@ -1650,19 +1655,29 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } -int trace_find_tgid(int pid) +static int __find_tgid_locked(int pid) { unsigned map; int tgid; - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - tgid = savedcmd->saved_tgids[map]; + tgid = savedcmd->map_cmdline_to_tgid[map]; else tgid = -1; + return tgid; +} + +int trace_find_tgid(int pid) +{ + int tgid; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + tgid = __find_tgid_locked(pid); + arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -3979,10 +3994,15 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, { char buf[64]; int r; + unsigned int n; + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); - r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + n = savedcmd->cmdline_num; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + r = scnprintf(buf, sizeof(buf), "%u\n", n); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -3991,7 +4011,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { kfree(s->saved_cmdlines); kfree(s->map_cmdline_to_pid); - kfree(s->saved_tgids); + kfree(s->map_cmdline_to_tgid); kfree(s); } @@ -4008,10 +4028,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val) return -ENOMEM; } + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; @@ -4230,33 +4252,61 @@ tracing_saved_tgids_read(struct file *file, char __user *ubuf, char *file_buf; char *buf; int len = 0; - int pid; int i; + int *pids; + int n = 0; - file_buf = kmalloc(savedcmd->cmdline_num*(16+1+16), GFP_KERNEL); - if (!file_buf) - return -ENOMEM; + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); - buf = file_buf; + pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL); + if (!pids) { + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + return -ENOMEM; + } for (i = 0; i < savedcmd->cmdline_num; i++) { - int tgid; - int r; + int pid; pid = savedcmd->map_cmdline_to_pid[i]; if (pid == -1 || pid == NO_CMDLINE_MAP) continue; - tgid = trace_find_tgid(pid); - r = sprintf(buf, "%d %d\n", pid, tgid); + pids[n] = pid; + pids[n+1] = __find_tgid_locked(pid); + n += 2; + } + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + if (n == 0) { + kfree(pids); + return 0; + } + + /* enough to hold max pair of pids + space, lr and nul */ + len = n * 12; + file_buf = kmalloc(len, GFP_KERNEL); + if (!file_buf) { + kfree(pids); + return -ENOMEM; + } + + buf = file_buf; + for (i = 0; i < n && len > 0; i += 2) { + int r; + + r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]); buf += r; - len += r; + len -= r; } len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + file_buf, buf - file_buf); kfree(file_buf); + kfree(pids); return len; } @@ -6847,6 +6897,7 @@ static 
int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 12ea4ea619ee..e9092a0247bf 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -659,30 +659,25 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - if (isdigit(argv[1][0])) { - if (is_return) { - pr_info("Return probe point must be a symbol.\n"); - return -EINVAL; - } - /* an address specified */ - ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); - if (ret) { - pr_info("Failed to parse address.\n"); - return ret; - } - } else { + + /* try to parse an address. if that fails, try to read the + * input as a symbol. */ + if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { /* a symbol specified */ symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret) { - pr_info("Failed to parse symbol.\n"); + pr_info("Failed to parse either an address or a symbol.\n"); return ret; } if (offset && is_return) { pr_info("Return probe must be used without offset.\n"); return -EINVAL; } + } else if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; } argc -= 2; argv += 2; |
