Diffstat (limited to 'kernel')
44 files changed, 2787 insertions, 745 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cbfba78d3db..85de5094b936 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -313,7 +313,8 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_insn(struct bpf_insn *insn) +static void print_bpf_insn(const struct verifier_env *env, + const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -377,9 +378,19 @@ static void print_bpf_insn(struct bpf_insn *insn) insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM) { - verbose("(%02x) r%d = 0x%x\n", - insn->code, insn->dst_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !env->allow_ptr_leaks) + imm = 0; + + verbose("(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); } else { verbose("BUG_ld_%02x\n", insn->code); return; @@ -1758,7 +1769,7 @@ static int do_check(struct verifier_env *env) if (log_level) { verbose("%d: ", insn_idx); - print_bpf_insn(insn); + print_bpf_insn(env, insn); } if (class == BPF_ALU || class == BPF_ALU64) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 25cf44889559..077bb52e2d47 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -717,10 +717,10 @@ static void css_set_move_task(struct task_struct *task, if (to_cset) { /* - * We are synchronized through cgroup_threadgroup_rwsem - * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. + * We are synchronized through css_set_lock against + * PF_EXITING setting such that we can't race against + * cgroup_exit() disassociating the task from the + * css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -5701,19 +5701,22 @@ void cgroup_exit(struct task_struct *tsk) int i; /* - * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. + * Avoid potential race with the migrate path. + */ + spin_lock_irq(&css_set_lock); + /* + * Unlink from @tsk from its css_set. 
*/ cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); } + spin_unlock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ for_each_subsys_which(ss, i, &have_exit_callback) ss->exit(tsk); diff --git a/kernel/cpu.c b/kernel/cpu.c index e822cb0e18d5..1a26ef5b7d58 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -361,6 +361,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1) + return -EBUSY; + cpu_hotplug_begin(); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 29c7240172d3..03dbc231a4a0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -174,9 +174,9 @@ typedef enum { } cpuset_flagbits_t; /* convenient tests for these bits */ -static inline bool is_cpuset_online(const struct cpuset *cs) +static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags); + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) diff --git a/kernel/events/core.c b/kernel/events/core.c index 95c447e658f7..7fee87daac56 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1693,33 +1693,7 @@ static int __perf_remove_from_context(void *info) return 0; } - -#ifdef CONFIG_SMP -static void perf_retry_remove(struct perf_event *event, - struct remove_event *rep) -{ - int up_ret; - /* - * CPU was offline. Bring it online so we can - * gracefully exit a perf context. - */ - up_ret = cpu_up(event->cpu); - if (!up_ret) - /* Try the remove call once again. */ - cpu_function_call(event->cpu, __perf_remove_from_context, - rep); - else - pr_err("Failed to bring up CPU: %d, ret: %d\n", - event->cpu, up_ret); -} -#else -static void perf_retry_remove(struct perf_event *event, - struct remove_event *rep) -{ -} -#endif - - /* +/* * Remove the event from a task's (or a CPU's) list of events. * * CPU events are removed with a smp call. For task events we only @@ -1754,9 +1728,6 @@ static void __ref perf_remove_from_context(struct perf_event *event, */ ret = cpu_function_call(event->cpu, __perf_remove_from_context, &re); - if (ret == -ENXIO) - perf_retry_remove(event, &re); - return; } @@ -6595,6 +6566,21 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + * To avoid leaking information to userspace, we must always + * reject kernel samples when exclude_kernel is set. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + /* * Generic event overflow handling, sampling. */ @@ -6642,6 +6628,12 @@ static int __perf_event_overflow(struct perf_event *event, } /* + * For security, drop the skid kernel samples if necessary. 
+ */ + if (!sample_is_allowed(event, regs)) + return ret; + + /* * XXX event_limit might not quite work as expected on inherited * events */ @@ -7117,8 +7109,6 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .events_across_hotplug = 1, }; #ifdef CONFIG_EVENT_TRACING @@ -7240,8 +7230,6 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .events_across_hotplug = 1, }; static inline void perf_tp_register(void) @@ -7529,8 +7517,6 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, - - .events_across_hotplug = 1, }; /* @@ -7612,8 +7598,6 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, - - .events_across_hotplug = 1, }; static void perf_pmu_nop_void(struct pmu *pmu) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 7da5b674d16e..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -614,8 +614,6 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, - - .events_across_hotplug = 1, }; int __init init_hw_breakpoint(void) diff --git a/kernel/fork.c b/kernel/fork.c index 622571d2a833..246b8a57a32d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -370,7 +370,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_int(); + tsk->stack_canary = get_random_long(); #endif /* @@ -1596,11 +1596,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ recalc_sigpending(); if (signal_pending(current)) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } + if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + retval = -ENOMEM; + goto bad_fork_cancel_cgroup; + } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -1651,6 +1653,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; bad_fork_cancel_cgroup: + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: threadgroup_change_end(current); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9812d9c0d483..e0449956298e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -810,8 +810,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, if (!desc) return; - __irq_do_set_handler(desc, handle, 1, NULL); desc->irq_common_data.handler_data = data; + __irq_do_set_handler(desc, handle, 1, NULL); irq_put_desc_busunlock(desc, flags); } diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 6c8e154c7384..4684b7595e63 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -36,10 +36,32 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = &available_cpus; if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + /* + * The order of preference for selecting a fallback CPU is + * + * (1) online and un-isolated CPU from default affinity + * (2) online and un-isolated CPU + * (3) online CPU + */ cpumask_andnot(&available_cpus, cpu_online_mask, cpu_isolated_mask); - if 
(cpumask_empty(affinity)) + if (cpumask_intersects(&available_cpus, irq_default_affinity)) + cpumask_and(&available_cpus, &available_cpus, + irq_default_affinity); + else if (cpumask_empty(&available_cpus)) affinity = cpu_online_mask; + + /* + * We are overriding the affinity with all online and + * un-isolated cpus. irq_set_affinity_locked() call + * below notify this mask to PM QOS affinity listener. + * That results in applying the CPU_DMA_LATENCY QOS + * to all the CPUs specified in the mask. But the low + * level irqchip driver sets the affinity of an irq + * to only one CPU. So pick only one CPU from the + * prepared mask while overriding the user affinity. + */ + affinity = cpumask_of(cpumask_any(affinity)); ret = true; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e5c70dcb7f8e..2c2effdb4437 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1305,8 +1305,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) + if (ret) { + irq_release_resources(desc); goto out_mask; + } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index cd6009006510..41b40f310c28 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq; + int i, ret, virq = 0; ret = ops->msi_check(domain, info, dev); if (ret == 0) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a24c5b909047..b05509af0352 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -114,6 +114,11 @@ static ssize_t write_irq_affinity(int type, struct file *file, goto free_cpumask; } + if (cpumask_subset(new_value, cpu_isolated_mask)) { + err = -EINVAL; + goto free_cpumask; + } + /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b9b5e0..695763516908 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -563,7 +563,7 @@ static void kprobe_optimizer(struct work_struct *work) } /* Wait for completing optimization and unoptimization */ -static void wait_for_kprobe_optimizer(void) +void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); diff --git a/kernel/kthread.c b/kernel/kthread.c index 850b255649a2..698b8dec3074 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -604,6 +604,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* + * Returns true when the work could not be queued at the moment. + * It happens when it is already pending in a worker list + * or when it is being cancelled. 
+ */ +static inline bool queuing_blocked(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + + return !list_empty(&work->node) || work->canceling; +} + /* insert @work before @pos in @worker */ static void insert_kthread_work(struct kthread_worker *worker, struct kthread_work *work, @@ -633,7 +646,7 @@ bool queue_kthread_work(struct kthread_worker *worker, unsigned long flags; spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { + if (!queuing_blocked(worker, work)) { insert_kthread_work(worker, work, &worker->work_list); ret = true; } @@ -694,6 +707,87 @@ retry: } EXPORT_SYMBOL_GPL(flush_kthread_work); +/* + * This function removes the work from the worker queue. Also it makes sure + * that it won't get queued later via the delayed work's timer. + * + * The work might still be in use when this function finishes. See the + * current_work proceed by the worker. + * + * Return: %true if @work was pending and successfully canceled, + * %false if @work was not pending + */ +static bool __kthread_cancel_work(struct kthread_work *work, + unsigned long *flags) +{ + /* + * Try to remove the work from a worker list. It might either + * be from worker->work_list or from worker->delayed_work_list. + */ + if (!list_empty(&work->node)) { + list_del_init(&work->node); + return true; + } + + return false; +} + +static bool __kthread_cancel_work_sync(struct kthread_work *work) +{ + struct kthread_worker *worker = work->worker; + unsigned long flags; + int ret = false; + + if (!worker) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); + + ret = __kthread_cancel_work(work, &flags); + + if (worker->current_work != work) + goto out_fast; + + /* + * The work is in progress and we need to wait with the lock released. + * In the meantime, block any queuing by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, flags); + flush_kthread_work(work); + spin_lock_irqsave(&worker->lock, flags); + work->canceling--; + +out_fast: + spin_unlock_irqrestore(&worker->lock, flags); +out: + return ret; +} + +/** + * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish + * @work: the kthread work to cancel + * + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself. On return from this + * function, @work is guaranteed to be not pending or executing on any CPU. + * + * kthread_cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. + * + * The caller must ensure that the worker on which @work was last + * queued can't be destroyed before this function returns. + * + * Return: %true if @work was pending, %false otherwise. 
+ */ +bool kthread_cancel_work_sync(struct kthread_work *work) +{ + return __kthread_cancel_work_sync(work); +} +EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); + /** * flush_kthread_worker - flush all current works on a kthread_worker * @worker: worker to flush diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 05a37857ab55..1e6a51cc25c4 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -1,6 +1,7 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/osq_lock.h> +#include <linux/sched/rt.h> /* * An MCS like lock especially tailored for optimistic spinning for sleeping @@ -85,6 +86,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) { struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); struct optimistic_spin_node *prev, *next; + struct task_struct *task = current; int curr = encode_cpu(smp_processor_id()); int old; @@ -104,6 +106,32 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; + + /* + * We need to avoid reordering of link updation sequence of osq. + * A case in which the status of optimistic spin queue is + * CPU6->CPU2 in which CPU6 has acquired the lock. At this point + * if CPU0 comes in to acquire osq_lock, it will update the tail + * count. After tail count update if CPU2 starts to unqueue itself + * from optimistic spin queue, it will find updated tail count with + * CPU0 and update CPU2 node->next to NULL in osq_wait_next(). If + * reordering of following stores happen then prev->next where prev + * being CPU2 would be updated to point to CPU0 node: + * node->prev = prev; + * WRITE_ONCE(prev->next, node); + * + * At this point if next instruction + * WRITE_ONCE(next->prev, prev); + * in CPU2 path is committed before the update of CPU0 node->prev = + * prev then CPU0 node->prev will point to CPU6 node. At this point + * if CPU0 path's node->prev = prev is committed resulting in change + * of CPU0 prev back to CPU2 node. CPU2 node->next is NULL, so if + * CPU0 gets into unqueue path of osq_lock it will keep spinning + * in infinite loop as condition prev->next == node will never be + * true. + */ + smp_mb(); + WRITE_ONCE(prev->next, node); /* @@ -118,8 +146,13 @@ bool osq_lock(struct optimistic_spin_queue *lock) while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. + * If a task spins on owner on a CPU after acquiring + * osq_lock while a RT task spins on another CPU to + * acquire osq_lock, it will starve the owner from + * completing if owner is to be scheduled on the same CPU. + * It will be a live lock. */ - if (need_resched()) + if (need_resched() || rt_task(task)) goto unqueue; cpu_relax_lowlatency(); diff --git a/kernel/padata.c b/kernel/padata.c index 401227e3967c..ecc7b3f452c7 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -357,7 +357,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd, cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pd->cpumask.cbcpu); + free_cpumask_var(pd->cpumask.pcpu); return -ENOMEM; } diff --git a/kernel/panic.c b/kernel/panic.c index 982a52352cfc..679254405510 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -172,7 +172,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. 
*/ - pr_emerg("Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..\n", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..567ecc826bc8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -255,7 +255,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * if reparented. */ for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->nr_hashed == init_pids) break; schedule(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c7e8ed99c953..5e2cd1030702 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -28,19 +28,25 @@ #include <linux/compat.h> +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -353,7 +359,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -420,7 +426,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 308f80ce2e43..a353df46c8e4 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -22,4 +22,6 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d2b8834dd3b..0071785e698b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2623,9 +2623,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - rq = __task_rq_lock(p); mark_task_starting(p); + post_init_entity_util_avg(&p->se); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -3154,9 +3154,10 @@ unsigned long sum_capacity_reqs(unsigned long cfs_cap, return total += scr->dl; } +unsigned long boosted_cpu_util(int cpu); static void sched_freq_tick_pelt(int cpu) { - unsigned long cpu_utilization = capacity_max; + unsigned long cpu_utilization = boosted_cpu_util(cpu); unsigned long capacity_curr = capacity_curr_of(cpu); struct sched_capacity_reqs *scr; @@ -6460,9 
+6461,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); return -1; } @@ -6555,8 +6553,12 @@ static inline bool sched_debug(void) static int sd_degenerate(struct sched_domain *sd) { - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; + if (cpumask_weight(sched_domain_span(sd)) == 1) { + if (sd->groups->sge) + sd->flags &= ~SD_LOAD_BALANCE; + else + return 1; + } /* Following flags need at least 2 groups */ if (sd->flags & (SD_LOAD_BALANCE | @@ -6564,6 +6566,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | SD_SHARE_CAP_STATES)) { @@ -6595,11 +6598,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN | SD_SHARE_CAP_STATES); + if (parent->groups->sge) { + parent->flags &= ~SD_LOAD_BALANCE; + return 0; + } if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -6680,6 +6688,9 @@ static int init_rootdomain(struct root_domain *rd) goto free_rto_mask; init_max_cpu_capacity(&rd->max_cpu_capacity); + + rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1; + return 0; free_rto_mask: @@ -6996,6 +7007,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the @@ -7291,11 +7303,19 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_SHARE_CAP_STATES - describes shared capacity states + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. 
Behaviour is artificial and mapped in the below sd_init() + * function: + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies + * SD_SHARE_CAP_STATES - describes shared capacity states + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -7305,11 +7325,13 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN | \ SD_SHARE_CAP_STATES) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + struct sched_domain *child, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); int sd_weight, sd_flags = 0; @@ -7361,6 +7383,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif @@ -7370,6 +7393,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) * Convert topological properties into behaviour. */ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; @@ -7816,16 +7846,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; + struct sched_domain *sd = sd_init(tl, child, cpu); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { @@ -7859,7 +7886,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; - struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7877,8 +7903,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; } } @@ -7914,8 +7938,19 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - rq = cpu_rq(i); + int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu); + int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu); + + if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig > + cpu_rq(max_cpu)->cpu_capacity_orig)) + WRITE_ONCE(d.rd->max_cap_orig_cpu, i); + + if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig < + cpu_rq(min_cpu)->cpu_capacity_orig)) + WRITE_ONCE(d.rd->min_cap_orig_cpu, i); + sd = *per_cpu_ptr(d.sd, i); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); @@ -8339,6 +8374,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; 
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get? * diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c index 0b5f2dea18a1..ce15ae7fe76b 100644 --- a/kernel/sched/core_ctl.c +++ b/kernel/sched/core_ctl.c @@ -39,11 +39,13 @@ struct cluster_data { cpumask_t cpu_mask; unsigned int need_cpus; unsigned int task_thres; + unsigned int max_nr; s64 need_ts; struct list_head lru; bool pending; spinlock_t pending_lock; bool is_big_cluster; + bool enable; int nrrun; bool nrrun_changed; struct task_struct *core_ctl_thread; @@ -60,6 +62,7 @@ struct cpu_data { struct cluster_data *cluster; struct list_head sib; bool isolated_by_us; + unsigned int max_nr; }; static DEFINE_PER_CPU(struct cpu_data, cpu_state); @@ -244,6 +247,29 @@ static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf) return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); } +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable); +} + static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) { return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); @@ -372,6 +398,7 @@ core_ctl_attr_ro(need_cpus); core_ctl_attr_ro(active_cpus); core_ctl_attr_ro(global_state); core_ctl_attr_rw(not_preferred); +core_ctl_attr_rw(enable); static struct attribute *default_attrs[] = { &min_cpus.attr, @@ -381,6 +408,7 @@ static struct attribute *default_attrs[] = { &busy_down_thres.attr, &task_thres.attr, &is_big_cluster.attr, + &enable.attr, &need_cpus.attr, &active_cpus.attr, &global_state.attr, @@ -429,7 +457,6 @@ static struct kobj_type ktype_core_ctl = { #define RQ_AVG_TOLERANCE 2 #define RQ_AVG_DEFAULT_MS 20 -#define NR_RUNNING_TOLERANCE 5 static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS; static s64 rq_avg_timestamp_ms; @@ -437,6 +464,7 @@ static s64 rq_avg_timestamp_ms; static void update_running_avg(bool trigger_update) { int avg, iowait_avg, big_avg, old_nrrun; + int old_max_nr, max_nr, big_max_nr; s64 now; unsigned long flags; struct cluster_data *cluster; @@ -450,40 +478,23 @@ static void update_running_avg(bool trigger_update) return; } rq_avg_timestamp_ms = now; - sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg); + sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg, + &max_nr, &big_max_nr); spin_unlock_irqrestore(&state_lock, flags); - /* - * Round up to the next integer if the average nr running tasks - * is within NR_RUNNING_TOLERANCE/100 of the next integer. - * If normal rounding up is used, it will allow a transient task - * to trigger online event. By the time core is onlined, the task - * has finished. - * Rounding to closest suffers same problem because scheduler - * might only provide running stats per jiffy, and a transient - * task could skew the number for one jiffy. If core control - * samples every 2 jiffies, it will observe 0.5 additional running - * average which rounds up to 1 task. 
- */ - avg = (avg + NR_RUNNING_TOLERANCE) / 100; - big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100; - for_each_cluster(cluster, index) { if (!cluster->inited) continue; + old_nrrun = cluster->nrrun; - /* - * Big cluster only need to take care of big tasks, but if - * there are not enough big cores, big tasks need to be run - * on little as well. Thus for little's runqueue stat, it - * has to use overall runqueue average, or derive what big - * tasks would have to be run on little. The latter approach - * is not easy to get given core control reacts much slower - * than scheduler, and can't predict scheduler's behavior. - */ + old_max_nr = cluster->max_nr; cluster->nrrun = cluster->is_big_cluster ? big_avg : avg; - if (cluster->nrrun != old_nrrun) { + cluster->max_nr = cluster->is_big_cluster ? big_max_nr : max_nr; + + if (cluster->nrrun != old_nrrun || + cluster->max_nr != old_max_nr) { + if (trigger_update) apply_need(cluster); else @@ -493,6 +504,7 @@ static void update_running_avg(bool trigger_update) return; } +#define MAX_NR_THRESHOLD 4 /* adjust needed CPUs based on current runqueue information */ static unsigned int apply_task_need(const struct cluster_data *cluster, unsigned int new_need) @@ -503,7 +515,15 @@ static unsigned int apply_task_need(const struct cluster_data *cluster, /* only unisolate more cores if there are tasks to run */ if (cluster->nrrun > new_need) - return new_need + 1; + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; return new_need; } @@ -549,7 +569,7 @@ static bool eval_need(struct cluster_data *cluster) spin_lock_irqsave(&state_lock, flags); - if (cluster->boost) { + if (cluster->boost || !cluster->enable) { need_cpus = cluster->max_cpus; } else { cluster->active_cpus = get_active_cpu_count(cluster); @@ -1046,6 +1066,7 @@ static int cluster_init(const struct cpumask *mask) cluster->offline_delay_ms = 100; cluster->task_thres = UINT_MAX; cluster->nrrun = cluster->num_cpus; + cluster->enable = true; INIT_LIST_HEAD(&cluster->lru); spin_lock_init(&cluster->pending_lock); diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..dbc51442ecbc --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,63 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * @func: Callback function to set for the CPU. + * + * Set and publish the update_util_data pointer for the given CPU. + * + * The update_util_data pointer of @cpu is set to @data and the callback + * function pointer in the target struct update_util_data is set to @func. + * That function will be called by cpufreq_update_util() from RCU-sched + * read-side critical sections, so it must not sleep. 
@data will always be + * passed to it as the first argument which allows the function to get to the + * target update_util_data structure and its container. + * + * The update_util_data pointer of @cpu must be NULL when this function is + * called or it will WARN() and return with no effect. + */ +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned int flags)) +{ + if (WARN_ON(!data || !func)) + return; + + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu))) + return; + + data->func = func; + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); + +/** + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer. + * @cpu: The CPU to clear the pointer for. + * + * Clear the update_util_data pointer for the given CPU. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_remove_update_util_hook(int cpu) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); +} +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index d751bc2d0d6e..f10d9f7d6d07 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -32,6 +32,12 @@ static struct cpufreq_governor cpufreq_gov_sched; static DEFINE_PER_CPU(unsigned long, enabled); DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +struct gov_tunables { + struct gov_attr_set attr_set; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; +}; + /** * gov_data - per-policy data internal to the governor * @up_throttle: next throttling period expiry if increasing OPP @@ -53,8 +59,8 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); struct gov_data { ktime_t up_throttle; ktime_t down_throttle; - unsigned int up_throttle_nsec; - unsigned int down_throttle_nsec; + struct gov_tunables *tunables; + struct list_head tunables_hook; struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; @@ -71,8 +77,10 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); - gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); + gd->up_throttle = ktime_add_ns(ktime_get(), + gd->tunables->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), + gd->tunables->down_throttle_nsec); up_write(&policy->rwsem); } @@ -262,12 +270,70 @@ static inline void clear_sched_freq(void) static_key_slow_dec(&__sched_freq); } -static struct attribute_group sched_attr_group_gov_pol; -static struct attribute_group *get_sysfs_attr(void) +/* Tunables */ +static struct gov_tunables *global_tunables; + +static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set) { - return &sched_attr_group_gov_pol; + return container_of(attr_set, struct gov_tunables, attr_set); } +static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->up_throttle_nsec); +} + +static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct 
gov_tunables *tunables = to_tunables(attr_set); + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + tunables->up_throttle_nsec = val; + return count; +} + +static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->down_throttle_nsec); +} + +static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + tunables->down_throttle_nsec = val; + return count; +} + +static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec); +static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec); + +static struct attribute *schedfreq_attributes[] = { + &up_throttle_nsec.attr, + &down_throttle_nsec.attr, + NULL +}; + +static struct kobj_type tunables_ktype = { + .default_attrs = schedfreq_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) { struct gov_data *gd; @@ -282,17 +348,39 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (!gd) return -ENOMEM; - gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? - policy->cpuinfo.transition_latency : - THROTTLE_UP_NSEC; - gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; - pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->up_throttle_nsec); - - rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); - if (rc) { - pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); - goto err; + policy->governor_data = gd; + + if (!global_tunables) { + gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL); + if (!gd->tunables) + goto free_gd; + + gd->tunables->up_throttle_nsec = + policy->cpuinfo.transition_latency ? 
+ policy->cpuinfo.transition_latency : + THROTTLE_UP_NSEC; + gd->tunables->down_throttle_nsec = + THROTTLE_DOWN_NSEC; + + rc = kobject_init_and_add(&gd->tunables->attr_set.kobj, + &tunables_ktype, + get_governor_parent_kobj(policy), + "%s", cpufreq_gov_sched.name); + if (rc) + goto free_tunables; + + gov_attr_set_init(&gd->tunables->attr_set, + &gd->tunables_hook); + + pr_debug("%s: throttle_threshold = %u [ns]\n", + __func__, gd->tunables->up_throttle_nsec); + + if (!have_governor_per_policy()) + global_tunables = gd->tunables; + } else { + gd->tunables = global_tunables; + gov_attr_set_get(&global_tunables->attr_set, + &gd->tunables_hook); } policy->governor_data = gd; @@ -304,7 +392,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (IS_ERR_OR_NULL(gd->task)) { pr_err("%s: failed to create kschedfreq thread\n", __func__); - goto err; + goto free_tunables; } get_task_struct(gd->task); kthread_bind_mask(gd->task, policy->related_cpus); @@ -316,7 +404,9 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) return 0; -err: +free_tunables: + kfree(gd->tunables); +free_gd: policy->governor_data = NULL; kfree(gd); return -ENOMEM; @@ -324,6 +414,7 @@ err: static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) { + unsigned int count; struct gov_data *gd = policy->governor_data; clear_sched_freq(); @@ -332,7 +423,12 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(&policy->kobj, get_sysfs_attr()); + count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook); + if (!count) { + if (!have_governor_per_policy()) + global_tunables = NULL; + kfree(gd->tunables); + } policy->governor_data = NULL; @@ -394,88 +490,6 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy, return 0; } -/* Tunables */ -static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) -{ - return sprintf(buf, "%u\n", gd->up_throttle_nsec); -} - -static ssize_t store_up_throttle_nsec(struct gov_data *gd, - const char *buf, size_t count) -{ - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if (ret < 0) - return ret; - gd->up_throttle_nsec = val; - return count; -} - -static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) -{ - return sprintf(buf, "%u\n", gd->down_throttle_nsec); -} - -static ssize_t store_down_throttle_nsec(struct gov_data *gd, - const char *buf, size_t count) -{ - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if (ret < 0) - return ret; - gd->down_throttle_nsec = val; - return count; -} - -/* - * Create show/store routines - * - sys: One governor instance for complete SYSTEM - * - pol: One governor instance per struct cpufreq_policy - */ -#define show_gov_pol_sys(file_name) \ -static ssize_t show_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, char *buf) \ -{ \ - return show_##file_name(policy->governor_data, buf); \ -} - -#define store_gov_pol_sys(file_name) \ -static ssize_t store_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, const char *buf, size_t count) \ -{ \ - return store_##file_name(policy->governor_data, buf, count); \ -} - -#define gov_pol_attr_rw(_name) \ - static struct freq_attr _name##_gov_pol = \ - __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) - -#define show_store_gov_pol_sys(file_name) \ - show_gov_pol_sys(file_name); \ - store_gov_pol_sys(file_name) -#define tunable_handlers(file_name) \ - show_gov_pol_sys(file_name); \ - 
store_gov_pol_sys(file_name); \ - gov_pol_attr_rw(file_name) - -tunable_handlers(down_throttle_nsec); -tunable_handlers(up_throttle_nsec); - -/* Per policy governor instance */ -static struct attribute *sched_attributes_gov_pol[] = { - &up_throttle_nsec_gov_pol.attr, - &down_throttle_nsec_gov_pol.attr, - NULL, -}; - -static struct attribute_group sched_attr_group_gov_pol = { - .attrs = sched_attributes_gov_pol, - .name = "sched", -}; #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED static diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c new file mode 100644 index 000000000000..75bfbb336722 --- /dev/null +++ b/kernel/sched/cpufreq_schedutil.c @@ -0,0 +1,770 @@ +/* + * CPUFreq governor based on scheduler-provided CPU utilization data. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpufreq.h> +#include <linux/kthread.h> +#include <linux/slab.h> +#include <trace/events/power.h> + +#include "sched.h" +#include "tune.h" + +unsigned long boosted_cpu_util(int cpu); + +/* Stub out fast switch routines present on mainline to reduce the backport + * overhead. */ +#define cpufreq_driver_fast_switch(x, y) 0 +#define cpufreq_enable_fast_switch(x) +#define cpufreq_disable_fast_switch(x) +#define LATENCY_MULTIPLIER (1000) +#define SUGOV_KTHREAD_PRIORITY 50 + +struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int up_rate_limit_us; + unsigned int down_rate_limit_us; +}; + +struct sugov_policy { + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 min_rate_limit_ns; + s64 up_rate_delay_ns; + s64 down_rate_delay_ns; + unsigned int next_freq; + + /* The next fields are only needed if fast switch cannot be used. */ + struct irq_work irq_work; + struct kthread_work work; + struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; + bool work_in_progress; + + bool need_freq_update; +}; + +struct sugov_cpu { + struct update_util_data update_util; + struct sugov_policy *sg_policy; + + unsigned int cached_raw_freq; + unsigned long iowait_boost; + unsigned long iowait_boost_max; + u64 last_update; + + /* The fields below are only needed when sharing a policy. */ + unsigned long util; + unsigned long max; + unsigned int flags; +}; + +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); + +/************************ Governor internals ***********************/ + +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) +{ + s64 delta_ns; + + if (sg_policy->work_in_progress) + return false; + + if (unlikely(sg_policy->need_freq_update)) { + sg_policy->need_freq_update = false; + /* + * This happens when limits change, so forget the previous + * next_freq value and force an update. 
+ */ + sg_policy->next_freq = UINT_MAX; + return true; + } + + delta_ns = time - sg_policy->last_freq_update_time; + + /* No need to recalculate next freq for min_rate_limit_us at least */ + return delta_ns >= sg_policy->min_rate_limit_ns; +} + +static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + s64 delta_ns; + + delta_ns = time - sg_policy->last_freq_update_time; + + if (next_freq > sg_policy->next_freq && + delta_ns < sg_policy->up_rate_delay_ns) + return true; + + if (next_freq < sg_policy->next_freq && + delta_ns < sg_policy->down_rate_delay_ns) + return true; + + return false; +} + +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) + return; + + if (policy->fast_switch_enabled) { + if (sg_policy->next_freq == next_freq) { + trace_cpu_frequency(policy->cur, smp_processor_id()); + return; + } + sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (next_freq == CPUFREQ_ENTRY_INVALID) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); + } else if (sg_policy->next_freq != next_freq) { + sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; + sg_policy->work_in_progress = true; + irq_work_queue(&sg_policy->irq_work); + } +} + +/** + * get_next_freq - Compute a new frequency for a given cpufreq policy. + * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @util: Current CPU utilization. + * @max: CPU capacity. + * + * If the utilization is frequency-invariant, choose the new frequency to be + * proportional to it, that is + * + * next_freq = C * max_freq * util / max + * + * Otherwise, approximate the would-be frequency-invariant utilization by + * util_raw * (curr_freq / max_freq) which leads to + * + * next_freq = C * curr_freq * util_raw / max + * + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations. + */ +static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, + unsigned long max) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int freq = arch_scale_freq_invariant() ? 
+ policy->cpuinfo.max_freq : policy->cur; + + freq = (freq + (freq >> 2)) * util / max; + + if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + return sg_policy->next_freq; + sg_cpu->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); +} + +static inline bool use_pelt(void) +{ +#ifdef CONFIG_SCHED_WALT + return (!sysctl_sched_use_walt_cpu_util || walt_disabled); +#else + return true; +#endif +} + +static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + unsigned long max_cap, rt; + s64 delta; + + max_cap = arch_scale_cpu_capacity(NULL, cpu); + + sched_avg_update(rq); + delta = time - rq->age_stamp; + if (unlikely(delta < 0)) + delta = 0; + rt = div64_u64(rq->rt_avg, sched_avg_period() + delta); + rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT; + + *util = boosted_cpu_util(cpu); + if (likely(use_pelt())) + *util = min((*util + rt), max_cap); + + *max = max_cap; +} + +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + if (flags & SCHED_CPUFREQ_IOWAIT) { + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else if (sg_cpu->iowait_boost) { + s64 delta_ns = time - sg_cpu->last_update; + + /* Clear iowait_boost if the CPU apprears to have been idle. */ + if (delta_ns > TICK_NSEC) + sg_cpu->iowait_boost = 0; + } +} + +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, + unsigned long *max) +{ + unsigned long boost_util = sg_cpu->iowait_boost; + unsigned long boost_max = sg_cpu->iowait_boost_max; + + if (!boost_util) + return; + + if (*util * boost_max < *max * boost_util) { + *util = boost_util; + *max = boost_max; + } + sg_cpu->iowait_boost >>= 1; +} + +static void sugov_update_single(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned long util, max; + unsigned int next_f; + + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + + if (!sugov_should_update_freq(sg_policy, time)) + return; + + if (flags & SCHED_CPUFREQ_DL) { + next_f = policy->cpuinfo.max_freq; + } else { + sugov_get_util(&util, &max, time); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_cpu, util, max); + } + sugov_update_commit(sg_policy, time, next_f); +} + +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, + unsigned long util, unsigned long max, + unsigned int flags) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int max_f = policy->cpuinfo.max_freq; + u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned int j; + + if (flags & SCHED_CPUFREQ_DL) + return max_f; + + sugov_iowait_boost(sg_cpu, &util, &max); + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu; + unsigned long j_util, j_max; + s64 delta_ns; + + if (j == smp_processor_id()) + continue; + + j_sg_cpu = &per_cpu(sugov_cpu, j); + /* + * If the CPU utilization was last updated before the previous + * frequency update and the time elapsed between the last update + * of the CPU utilization and the last frequency update is long + * enough, don't take the CPU into account as it probably is + * idle now (and clear iowait_boost for it). 
+ */ + delta_ns = last_freq_update_time - j_sg_cpu->last_update; + if (delta_ns > TICK_NSEC) { + j_sg_cpu->iowait_boost = 0; + continue; + } + if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) + return max_f; + + j_util = j_sg_cpu->util; + j_max = j_sg_cpu->max; + if (j_util * max > j_max * util) { + util = j_util; + max = j_max; + } + + sugov_iowait_boost(j_sg_cpu, &util, &max); + } + + return get_next_freq(sg_cpu, util, max); +} + +static void sugov_update_shared(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned long util, max; + unsigned int next_f; + + sugov_get_util(&util, &max, time); + + raw_spin_lock(&sg_policy->update_lock); + + sg_cpu->util = util; + sg_cpu->max = max; + sg_cpu->flags = flags; + + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + + if (sugov_should_update_freq(sg_policy, time)) { + next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + sugov_update_commit(sg_policy, time, next_f); + } + + raw_spin_unlock(&sg_policy->update_lock); +} + +static void sugov_work(struct kthread_work *work) +{ + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + + mutex_lock(&sg_policy->work_lock); + __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, + CPUFREQ_RELATION_L); + mutex_unlock(&sg_policy->work_lock); + + sg_policy->work_in_progress = false; +} + +static void sugov_irq_work(struct irq_work *irq_work) +{ + struct sugov_policy *sg_policy; + + sg_policy = container_of(irq_work, struct sugov_policy, irq_work); + + /* + * For Real Time and Deadline tasks, schedutil governor shoots the + * frequency to maximum. And special care must be taken to ensure that + * this kthread doesn't result in that. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() and before that schedutil + * rejects all other frequency scaling requests. + * + * Though there is a very rare case where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now. 
+ */ + queue_kthread_work(&sg_policy->worker, &sg_policy->work); +} + +/************************** sysfs interface ************************/ + +static struct sugov_tunables *global_tunables; +static DEFINE_MUTEX(global_tunables_lock); + +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set) +{ + return container_of(attr_set, struct sugov_tunables, attr_set); +} + +static DEFINE_MUTEX(min_rate_lock); + +static void update_min_rate_limit_us(struct sugov_policy *sg_policy) +{ + mutex_lock(&min_rate_lock); + sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns, + sg_policy->down_rate_delay_ns); + mutex_unlock(&min_rate_lock); +} + +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->up_rate_limit_us); +} + +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->down_rate_limit_us); +} + +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->up_rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + } + + return count; +} + +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->down_rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + } + + return count; +} + +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); + +static struct attribute *sugov_attributes[] = { + &up_rate_limit_us.attr, + &down_rate_limit_us.attr, + NULL +}; + +static struct kobj_type sugov_tunables_ktype = { + .default_attrs = sugov_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + +/********************** cpufreq governor interface *********************/ +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +static +#endif +struct cpufreq_governor cpufreq_gov_schedutil; + +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); + if (!sg_policy) + return NULL; + + sg_policy->policy = policy; + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + raw_spin_lock_init(&sg_policy->update_lock); + return sg_policy; +} + +static void sugov_policy_free(struct sugov_policy *sg_policy) +{ + mutex_destroy(&sg_policy->work_lock); + kfree(sg_policy); +} + +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = 
sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + init_kthread_work(&sg_policy->work, sugov_work); + init_kthread_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + flush_kthread_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); +} + +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) +{ + struct sugov_tunables *tunables; + + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (tunables) { + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook); + if (!have_governor_per_policy()) + global_tunables = tunables; + } + return tunables; +} + +static void sugov_tunables_free(struct sugov_tunables *tunables) +{ + if (!have_governor_per_policy()) + global_tunables = NULL; + + kfree(tunables); +} + +static int sugov_init(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + struct sugov_tunables *tunables; + unsigned int lat; + int ret = 0; + + /* State should be equivalent to EXIT */ + if (policy->governor_data) + return -EBUSY; + + sg_policy = sugov_policy_alloc(policy); + if (!sg_policy) + return -ENOMEM; + + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; + + mutex_lock(&global_tunables_lock); + + if (global_tunables) { + if (WARN_ON(have_governor_per_policy())) { + ret = -EINVAL; + goto stop_kthread; + } + policy->governor_data = sg_policy; + sg_policy->tunables = global_tunables; + + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook); + goto out; + } + + tunables = sugov_tunables_alloc(sg_policy); + if (!tunables) { + ret = -ENOMEM; + goto stop_kthread; + } + + tunables->up_rate_limit_us = LATENCY_MULTIPLIER; + tunables->down_rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) { + tunables->up_rate_limit_us *= lat; + tunables->down_rate_limit_us *= lat; + } + + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, + get_governor_parent_kobj(policy), "%s", + cpufreq_gov_schedutil.name); + if (ret) + goto fail; + + out: + mutex_unlock(&global_tunables_lock); + + cpufreq_enable_fast_switch(policy); + return 0; + + fail: + policy->governor_data = NULL; + sugov_tunables_free(tunables); + +stop_kthread: + sugov_kthread_stop(sg_policy); + +free_sg_policy: + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + pr_err("initialization failed (error %d)\n", ret); + return ret; +} + +static int sugov_exit(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + unsigned int count; + + cpufreq_disable_fast_switch(policy); + + mutex_lock(&global_tunables_lock); + + count = 
gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); + policy->governor_data = NULL; + if (!count) + sugov_tunables_free(tunables); + + mutex_unlock(&global_tunables_lock); + + sugov_kthread_stop(sg_policy); + sugov_policy_free(sg_policy); + + return 0; +} + +static int sugov_start(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + sg_policy->up_rate_delay_ns = + sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC; + sg_policy->down_rate_delay_ns = + sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + + sg_cpu->sg_policy = sg_policy; + if (policy_is_shared(policy)) { + sg_cpu->util = 0; + sg_cpu->max = 0; + sg_cpu->flags = SCHED_CPUFREQ_DL; + sg_cpu->last_update = 0; + sg_cpu->cached_raw_freq = 0; + sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_shared); + } else { + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_single); + } + } + return 0; +} + +static int sugov_stop(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + for_each_cpu(cpu, policy->cpus) + cpufreq_remove_update_util_hook(cpu); + + synchronize_sched(); + + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + + return 0; +} + +static int sugov_limits(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + + if (!policy->fast_switch_enabled) { + mutex_lock(&sg_policy->work_lock); + cpufreq_policy_apply_limits(policy); + mutex_unlock(&sg_policy->work_lock); + } + + sg_policy->need_freq_update = true; + + return 0; +} + +static int cpufreq_schedutil_cb(struct cpufreq_policy *policy, + unsigned int event) +{ + switch(event) { + case CPUFREQ_GOV_POLICY_INIT: + return sugov_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return sugov_exit(policy); + case CPUFREQ_GOV_START: + return sugov_start(policy); + case CPUFREQ_GOV_STOP: + return sugov_stop(policy); + case CPUFREQ_GOV_LIMITS: + return sugov_limits(policy); + default: + BUG(); + } +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +static +#endif +struct cpufreq_governor cpufreq_gov_schedutil = { + .name = "schedutil", + .governor = cpufreq_schedutil_cb, + .owner = THIS_MODULE, +}; + +static int __init sugov_register(void) +{ + return cpufreq_register_governor(&cpufreq_gov_schedutil); +} +fs_initcall(sugov_register); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..1d00cf8c00fa 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -27,6 +27,8 @@ * of the License. 
*/ +#include "sched.h" + #include <linux/gfp.h> #include <linux/sched.h> #include <linux/sched/rt.h> @@ -51,6 +53,27 @@ static int convert_prio(int prio) } /** + * drop_nopreempt_cpus - remove a cpu from the mask if it is likely + * non-preemptible + * @lowest_mask: mask with selected CPUs (non-NULL) + */ +static void +drop_nopreempt_cpus(struct cpumask *lowest_mask) +{ + unsigned int cpu = cpumask_first(lowest_mask); + + while (cpu < nr_cpu_ids) { + /* unlocked access */ + struct task_struct *task = READ_ONCE(cpu_rq(cpu)->curr); + + if (task_may_not_preempt(task, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + + cpu = cpumask_next(cpu, lowest_mask); + } +} + +/** * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task @@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, { int idx = 0; int task_pri = convert_prio(p->prio); + bool drop_nopreempts = task_pri <= MAX_RT_PRIO; BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); +retry: for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; int skip = 0; @@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); - + if (drop_nopreempts) + drop_nopreempt_cpus(lowest_mask); /* * We have to ensure that we have at least one bit * still set in the array, since the map could have @@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, return 1; } - + /* + * If we can't find any non-preemptible cpu's, retry so we can + * find the lowest priority target and avoid priority inversion. + */ + if (drop_nopreempts) { + drop_nopreempts = false; + goto retry; + } return 0; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3d55ec89c400..a105e97ab6bf 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -755,6 +755,9 @@ static void update_curr_dl(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* kick cpufreq (see the comment in kernel/sched/sched.h). 
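Aside, not part of the patch: the effect of passing SCHED_CPUFREQ_DL here is visible in the schedutil code earlier in this diff, where a CPU whose last update carries that flag makes the governor return the maximum frequency immediately. A reduced sketch of that decision (names and numbers invented; the real get_next_freq() also adds roughly 25% headroom to the proportional estimate):

#include <stdio.h>

#define SCHED_CPUFREQ_DL_SKETCH	(1U << 1)	/* stand-in for the real flag */

static unsigned int next_freq(unsigned int flags, unsigned long util,
			      unsigned long max, unsigned int max_freq)
{
	/* Deadline activity is treated as "need everything, now". */
	if (flags & SCHED_CPUFREQ_DL_SKETCH)
		return max_freq;

	/* Otherwise a simple proportional pick for illustration. */
	return (unsigned int)(max_freq * util / max);
}

int main(void)
{
	printf("%u\n", next_freq(0, 512, 1024, 1800000));		/* 900000 */
	printf("%u\n", next_freq(SCHED_CPUFREQ_DL_SKETCH, 64, 1024,
				 1800000));				/* 1800000 */
	return 0;
}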
*/ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c8c4272c61d8..ed8e6bb4531b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -636,6 +636,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.nr_wakeups_affine_attempts); P(se.statistics.nr_wakeups_passive); P(se.statistics.nr_wakeups_idle); + /* eas */ + /* select_idle_sibling() */ + P(se.statistics.nr_wakeups_sis_attempts); + P(se.statistics.nr_wakeups_sis_idle); + P(se.statistics.nr_wakeups_sis_cache_affine); + P(se.statistics.nr_wakeups_sis_suff_cap); + P(se.statistics.nr_wakeups_sis_idle_cpu); + P(se.statistics.nr_wakeups_sis_count); + /* select_energy_cpu_brute() */ + P(se.statistics.nr_wakeups_secb_attempts); + P(se.statistics.nr_wakeups_secb_sync); + P(se.statistics.nr_wakeups_secb_idle_bt); + P(se.statistics.nr_wakeups_secb_insuff_cap); + P(se.statistics.nr_wakeups_secb_no_nrg_sav); + P(se.statistics.nr_wakeups_secb_nrg_sav); + P(se.statistics.nr_wakeups_secb_count); + /* find_best_target() */ + P(se.statistics.nr_wakeups_fbt_attempts); + P(se.statistics.nr_wakeups_fbt_no_cpu); + P(se.statistics.nr_wakeups_fbt_no_sd); + P(se.statistics.nr_wakeups_fbt_pref_idle); + P(se.statistics.nr_wakeups_fbt_count); + /* cas */ + /* select_task_rq_fair() */ + P(se.statistics.nr_wakeups_cas_attempts); + P(se.statistics.nr_wakeups_cas_count); #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) __P(load_avg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4d96380b35e8..422438d43d90 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -50,7 +50,6 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; -unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; @@ -119,6 +118,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * margin < capacity * 1024 + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -294,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. 
Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; @@ -664,7 +709,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -688,20 +733,115 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = sched_freq() ? - sysctl_sched_initial_task_util : - scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * In previous Android versions, we used to have: + * sa->util_avg = sched_freq() ? + * sysctl_sched_initial_task_util : + * scale_load_down(SCHED_LOAD_SCALE); + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + * However, that functionality has been moved to enqueue. + * It is unclear if we should restore this in enqueue. + */ + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static void attach_entity_cfs_rq(struct sched_entity *se); + +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. 
+ * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. + */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + /* + * If we wish to restore tuning via setting initial util, + * this is where we should do it. + */ + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); + return; + } + } + + attach_entity_cfs_rq(se); +} + #else void init_entity_runnable_average(struct sched_entity *se) { } -#endif +void post_init_entity_util_avg(struct sched_entity *se) +{ +} +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -1425,7 +1565,8 @@ balance: * Call select_idle_sibling to maybe find a better one. */ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: assigned = true; @@ -2410,28 +2551,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long tg_weight; + long tg_weight, load, shares; /* - * Use this CPU's real-time load instead of the last load contribution - * as the updating of the contribution is delayed, and we will use the - * the real-time load to calc the share. See update_tg_load_avg(). + * This really should be: cfs_rq->avg.load_avg, but instead we use + * cfs_rq->load.weight, which is its upper bound. This helps ramp up + * the shares for small weight interactive tasks. 
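Aside, not part of the patch: with the rewrite below, the share computation is shares = tg->shares * load / tg_weight, where tg_weight is the group-wide load average with this cfs_rq's stale contribution swapped for its current load.weight. A worked example with invented numbers:

#include <stdio.h>

int main(void)
{
	long tg_shares = 1024;		/* tg->shares */
	long tg_load_avg = 3000;	/* sum over all CPUs (stale) */
	long contrib = 500;		/* this cfs_rq's last contribution */
	long load = 800;		/* this cfs_rq's current load.weight */
	long tg_weight, shares;

	/* Replace the stale contribution with the current local load,
	 * which also guarantees tg_weight >= load. */
	tg_weight = tg_load_avg - contrib + load;	/* 3300 */

	shares = tg_shares * load;
	if (tg_weight)
		shares /= tg_weight;			/* 1024*800/3300 = 248 */

	/* The unchanged tail of calc_cfs_shares() additionally clamps
	 * the result to [MIN_SHARES, tg->shares]. */
	if (shares < 2)
		shares = 2;
	if (shares > tg_shares)
		shares = tg_shares;

	printf("shares = %ld\n", shares);
	return 0;
}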
*/ - tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->load.weight; + load = scale_load_down(cfs_rq->load.weight); - return tg_weight; -} - -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - long tg_weight, load, shares; + tg_weight = atomic_long_read(&tg->load_avg); - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + /* Ensure tg_weight >= load */ + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += load; shares = (tg->shares * load); if (tg_weight) @@ -2450,6 +2585,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return tg->shares; } # endif /* CONFIG_SMP */ + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -2468,16 +2604,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) + return; + + if (throttled_hierarchy(cfs_rq)) return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2486,8 +2626,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3240,7 +3381,8 @@ retry: sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED; } } else if (stats.best_cpu >= 0) { - if (stats.best_cpu != task_cpu(p) && + if (stats.best_sibling_cpu >= 0 && + stats.best_cpu != task_cpu(p) && stats.min_cost == stats.best_sibling_cpu_cost) { stats.best_cpu = stats.best_sibling_cpu; sbc_flag |= SBC_FLAG_BEST_SIBLING; @@ -3536,6 +3678,16 @@ kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) static DEFINE_RAW_SPINLOCK(migration_lock); +static bool do_migration(int reason, int new_cpu, int cpu) +{ + if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION) + && same_cluster(new_cpu, cpu)) + return false; + + /* Inter cluster high irqload migrations are OK */ + return new_cpu != cpu; +} + /* * Check if currently running task should be migrated to a better cpu. * @@ -3553,7 +3705,7 @@ void check_for_migration(struct rq *rq, struct task_struct *p) raw_spin_lock(&migration_lock); new_cpu = select_best_cpu(p, cpu, reason, 0); - if (new_cpu != cpu) { + if (do_migration(reason, new_cpu, cpu)) { active_balance = kick_active_balance(rq, p, new_cpu); if (active_balance) mark_reserved(new_cpu); @@ -3779,25 +3931,262 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } -#ifdef CONFIG_FAIR_GROUP_SCHED /* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. 
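Aside, not part of the patch: add_positive(), defined just below, applies a signed delta to an unsigned accumulator and clamps the result to zero when the subtraction would wrap, so lockless readers never observe a huge bogus value. A standalone demo of the clamp (the READ_ONCE()/WRITE_ONCE() pair is dropped here because nothing runs concurrently in the demo):

#include <stdio.h>

#define add_positive_sketch(ptr, val) do {			\
	typeof(*(ptr)) res, var = *(ptr);			\
								\
	res = var + (val);					\
	/* A negative delta that wraps makes res > var. */	\
	if ((val) < 0 && res > var)				\
		res = 0;					\
	*(ptr) = res;						\
} while (0)

int main(void)
{
	unsigned long load_avg = 100;

	add_positive_sketch(&load_avg, 50L);	/* 150 */
	add_positive_sketch(&load_avg, -200L);	/* would wrap: clamped to 0 */
	printf("load_avg = %lu\n", load_avg);
	return 0;
}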
+ */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_FAIR_GROUP_SCHED +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. 
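Aside, not part of the patch: the #ifndef CONFIG_64BIT block just below is the usual tearing-free 64-bit read on 32-bit: the writer stores the value, a barrier, then a shadow copy; the reader loads the copy first, then the value, and retries until both agree. A userspace sketch of that pairing (invented names, plain accesses standing in for the kernel's annotated loads/stores, single-threaded in main()):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t last_update_time;	/* the real 64-bit datum */
static uint64_t last_update_time_copy;	/* shadow copy, written last */

static void writer_publish(uint64_t now)
{
	last_update_time = now;
	atomic_thread_fence(memory_order_release);	/* like smp_wmb() */
	last_update_time_copy = now;
}

static uint64_t reader_snapshot(void)
{
	uint64_t val, copy;

	do {
		copy = last_update_time_copy;
		atomic_thread_fence(memory_order_acquire);	/* like smp_rmb() */
		val = last_update_time;
	} while (val != copy);	/* retry if an update slipped in between */

	return val;
}

int main(void)
{
	writer_publish(123456789ULL);
	printf("%llu\n", (unsigned long long)reader_snapshot());
	return 0;
}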
+ */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. 
+ */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_of(cfs_rq), 0); + } +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* @@ -3817,23 +4206,43 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. 
+ */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -3848,68 +4257,89 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq == &rq_of(cfs_rq)->cfs) trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + return decayed || removed; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); int cpu = cpu_of(rq_of(cfs_rq)); + int decayed; + void *ptr = NULL; /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } + + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); - if (entity_is_task(se)) - trace_sched_load_avg_task(task_of(se), &se->avg); + if (entity_is_task(se)) { +#ifdef CONFIG_SCHED_WALT + ptr = (void *)&(task_of(se)->ravg); +#endif + trace_sched_load_avg_task(task_of(se), &se->avg, ptr); + } } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. 
- */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); + + cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -3917,34 +4347,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3973,13 +4389,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) #endif /* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + +/* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). 
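Aside, not part of the patch: remove_entity_load_avg() below cannot take the remote runqueue's lock, so it only accumulates the departing load into the atomic removed_load_avg/removed_util_avg counters; update_cfs_rq_load_avg() later drains them with an xchg and subtracts the result, clamped at zero. Reduced sketch of that hand-off (invented names, load only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long removed_load_avg;	/* written without the rq lock */
static long cfs_load_avg = 900;		/* only touched under the rq lock */

/* Migration/exit path: just record what left, lock-free. */
static void remove_entity_load(long se_load_avg)
{
	atomic_fetch_add(&removed_load_avg, se_load_avg);
}

/* Regular update path (rq lock held): drain and apply the removals. */
static void update_cfs_rq_load(void)
{
	long r = atomic_exchange(&removed_load_avg, 0);

	if (!r)
		return;
	/* sub_positive(): never let the average go negative. */
	cfs_load_avg = (cfs_load_avg > r) ? cfs_load_avg - r : 0;
}

int main(void)
{
	remove_entity_load(300);	/* one task migrated away */
	remove_entity_load(700);	/* another one left */
	update_cfs_rq_load();
	printf("cfs load_avg = %ld\n", cfs_load_avg);	/* clamped to 0 */
	return 0;
}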
*/ void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * Newly created task or never used group entity should not be removed @@ -3988,9 +4416,7 @@ void remove_entity_load_avg(struct sched_entity *se) if (se->avg.last_update_time == 0) return; - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -4027,7 +4453,16 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -4176,9 +4611,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -4251,6 +4687,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Substract its load from the cfs_rq->runnable_avg. + * - Substract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -4286,7 +4732,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } /* @@ -4341,7 +4787,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -4457,8 +4903,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. 
*/ - update_load_avg(curr, 1); - update_cfs_shares(cfs_rq); + update_load_avg(curr, UPDATE_TG); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -5102,6 +5548,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; + /* Synchronize hierarchical throttle counter: */ + if (unlikely(!cfs_rq->throttle_uptodate)) { + struct rq *rq = rq_of(cfs_rq); + struct cfs_rq *pcfs_rq; + struct task_group *tg; + + cfs_rq->throttle_uptodate = 1; + + /* Get closest up-to-date node, because leaves go first: */ + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { + pcfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (pcfs_rq->throttle_uptodate) + break; + } + if (tg) { + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(rq); + } + } + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -5342,7 +5808,7 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static bool cpu_overutilized(int cpu); -static inline unsigned long boosted_cpu_util(int cpu); +unsigned long boosted_cpu_util(int cpu); #else #define boosted_cpu_util(cpu) cpu_util(cpu) #endif @@ -5379,6 +5845,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_wakeup = flags & ENQUEUE_WAKEUP; #endif + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. + */ + if (p->in_iowait) + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + for_each_sched_entity(se) { if (se->on_rq) break; @@ -5407,8 +5881,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); - update_cfs_shares(cfs_rq); + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); } if (!se) { @@ -5492,15 +5966,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; @@ -5514,8 +5987,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); - update_cfs_shares(cfs_rq); + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); } if (!se) { @@ -6121,15 +6594,7 @@ static int sched_group_energy(struct energy_env *eenv) */ sd = rcu_dereference(per_cpu(sd_scs, cpu)); - if (!sd) - /* - * We most probably raced with hotplug; returning a - * wrong energy estimation is better than entering an - * infinite loop. - */ - return -EINVAL; - - if (sd->parent) + if (sd && sd->parent) sg_shared_cap = sd->parent->groups; for_each_domain(cpu, sd) { @@ -6184,6 +6649,14 @@ static int sched_group_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } + + /* + * If we raced with hotplug and got an sd NULL-pointer; + * returning a wrong energy estimation is better than + * entering an infinite loop. 
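Aside, not part of the patch: the relocated race check just below works by watching for forward progress; every iteration is expected to clear the visited CPU from visit_cpus, so if the bit is still set after the domain walk (because sd was NULL), the function bails out with -EINVAL instead of spinning forever. A stripped-down sketch of that shape using a plain bitmask (do_cpu() and walk() are invented):

#include <stdio.h>

#define EINVAL_SKETCH 22

/* Normally clears the visited CPU; does nothing if we raced with hotplug. */
static void do_cpu(unsigned long *mask, int cpu, int have_domain)
{
	if (!have_domain)
		return;			/* raced with hotplug: nothing cleared */
	*mask &= ~(1UL << cpu);		/* normal case: mark the CPU visited */
}

static int walk(unsigned long mask, int have_domain)
{
	while (mask) {
		int cpu = __builtin_ctzl(mask);	/* first set bit */

		do_cpu(&mask, cpu, have_domain);
		if (mask & (1UL << cpu))
			return -EINVAL_SKETCH;	/* no progress: bail out */
	}
	return 0;
}

int main(void)
{
	printf("normal: %d\n", walk(0xf, 1));	/* 0  */
	printf("raced : %d\n", walk(0xf, 0));	/* -22 */
	return 0;
}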
+ */ + if (cpumask_test_cpu(cpu, &visit_cpus)) + return -EINVAL; next_cpu: cpumask_clear_cpu(cpu, &visit_cpus); continue; @@ -6210,6 +6683,7 @@ static inline int __energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int diff, margin; struct energy_env eenv_before = { .util_delta = 0, @@ -6252,12 +6726,22 @@ static inline int __energy_diff(struct energy_env *eenv) eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - +#ifndef CONFIG_SCHED_TUNE trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); +#endif + /* + * Dead-zone margin preventing too many migrations. + */ + + margin = eenv->nrg.before >> 6; /* ~1.56% */ + + diff = eenv->nrg.after - eenv->nrg.before; + + eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; return eenv->nrg.diff; } @@ -6265,30 +6749,37 @@ static inline int __energy_diff(struct energy_env *eenv) #ifdef CONFIG_SCHED_TUNE struct target_nrg schedtune_target_nrg; - +extern bool schedtune_initialized; /* * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], * corresponding to the specified energy variation. */ static inline int normalize_energy(int energy_diff) { u32 normalized_nrg; + + /* during early setup, we don't know the extents */ + if (unlikely(!schedtune_initialized)) + return energy_diff < 0 ? -1 : 1 ; + #ifdef CONFIG_SCHED_DEBUG + { int max_delta; /* Check for boundaries */ max_delta = schedtune_target_nrg.max_power; max_delta -= schedtune_target_nrg.min_power; WARN_ON(abs(energy_diff) >= max_delta); + } #endif /* Do scaling using positive numbers to increase the range */ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; + normalized_nrg <<= SCHED_CAPACITY_SHIFT; /* Normalize on max energy for target platform */ normalized_nrg = reciprocal_divide( @@ -6319,6 +6810,12 @@ energy_diff(struct energy_env *eenv) eenv->cap.delta, eenv->task); + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + /* * When SchedTune is enabled, the energy_diff() function will return * the computed energy payoff value. 
Since the energy_diff() return @@ -6358,18 +6855,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -6429,8 +6926,6 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -unsigned int capacity_margin = 1280; /* ~20% margin */ - static inline unsigned long boosted_task_util(struct task_struct *task); static inline bool __task_fits(struct task_struct *p, int cpu, int util) @@ -6456,11 +6951,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static inline bool task_fits_spare(struct task_struct *p, int cpu) -{ - return __task_fits(p, cpu, cpu_util(cpu)); -} - static bool cpu_overutilized(int cpu) { return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); @@ -6468,6 +6958,8 @@ static bool cpu_overutilized(int cpu) #ifdef CONFIG_SCHED_TUNE +struct reciprocal_value schedtune_spc_rdiv; + static long schedtune_margin(unsigned long signal, long boost) { @@ -6478,29 +6970,16 @@ schedtune_margin(unsigned long signal, long boost) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S), if B is positive - * M = B * S, if B is negative + * M = B * (SCHED_CAPACITY_SCALE - S) * The obtained M could be used by the caller to "boost" S. */ if (boost >= 0) { - margin = SCHED_LOAD_SCALE - signal; + margin = SCHED_CAPACITY_SCALE - signal; margin *= boost; } else margin = -signal * boost; - /* - * Fast integer division by constant: - * Constant : (C) = 100 - * Precision : 0.1% (P) = 0.1 - * Reference : C * 100 / P (R) = 100000 - * - * Thus: - * Shift bits : ceil(log(R,2)) (S) = 17 - * Mult const : round(2^S/C) (M) = 1311 - * - * - */ - margin *= 1311; - margin >>= 17; + + margin = reciprocal_divide(margin, schedtune_spc_rdiv); if (boost < 0) margin *= -1; @@ -6550,7 +7029,7 @@ schedtune_task_margin(struct task_struct *task) #endif /* CONFIG_SCHED_TUNE */ -static inline unsigned long +unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); @@ -6572,6 +7051,13 @@ boosted_task_util(struct task_struct *task) return util + margin; } +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. 
@@ -6581,10 +7067,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL, *spare_group = NULL; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; - unsigned long fit_capacity = ULONG_MAX; - unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -6592,7 +7077,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_capacity; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -6604,8 +7089,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -6616,24 +7105,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; - /* - * Look for most energy-efficient group that can fit - * that can fit the task. - */ - if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { - fit_capacity = capacity_of(i); - fit_group = group; - } + spare_cap = capacity_spare_wake(i, p); - /* - * Look for group which has most spare capacity on a - * single cpu. - */ - spare_capacity = capacity_of(i) - cpu_util(i); - if (spare_capacity > max_spare_capacity) { - max_spare_capacity = spare_capacity; - spare_group = group; - } + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -6641,17 +7116,32 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); - if (fit_group) - return fit_group; - - if (spare_group) - return spare_group; + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. 
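Aside, not part of the patch: the condition implemented right after this comment reads as a three-way choice: stay in the local group when its spare capacity covers at least half the task and beats the best remote spare by the imbalance factor, otherwise take the group with the most spare capacity if that also covers half the task, and only then fall back to the least-loaded group (still subject to the unchanged load comparison below). Worked numbers, all invented:

#include <stdio.h>

/* 0 = stay local, 1 = most-spare group, 2 = least-loaded group */
static int pick_group(unsigned long task_util, unsigned long this_spare,
		      unsigned long most_spare, unsigned long imbalance)
{
	if (this_spare > task_util / 2 &&
	    imbalance * this_spare > 100 * most_spare)
		return 0;
	else if (most_spare > task_util / 2)
		return 1;
	return 2;
}

int main(void)
{
	/* task_util=200: local spare 150 beats remote spare 120
	 * (imbalance_pct 117: 117*150 > 100*120). */
	printf("%d\n", pick_group(200, 150, 120, 117));	/* 0 */

	/* Local spare too small; remote spare 300 > 100, so some task
	 * stuffing is allowed. */
	printf("%d\n", pick_group(200, 80, 300, 117));	/* 1 */

	/* Neither group has meaningful spare capacity. */
	printf("%d\n", pick_group(600, 100, 150, 117));	/* 2 */
	return 0;
}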
+ */ + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; if (!idlest || 100*this_load < imbalance*min_load) return NULL; @@ -6671,9 +7161,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (task_fits_spare(p, i)) { + if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -6685,8 +7179,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (idle_cpu(i) && - (!idle || idle->exit_latency == min_exit_latency) && + } else if ((!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -6695,13 +7188,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (shallowest_idle_cpu == -1) { - /* - * If we haven't found an idle CPU yet - * pick a non-idle one that can fit the task as - * fallback. - */ - shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -6718,24 +7204,32 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); - int best_idle = -1; - int best_idle_cstate = -1; - int best_idle_capacity = INT_MAX; + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + unsigned long best_idle_capacity = ULONG_MAX; + + schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts); + schedstat_inc(this_rq(), eas_stats.sis_attempts); if (!sysctl_sched_cstate_aware) { - if (idle_cpu(target)) + if (idle_cpu(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle); + schedstat_inc(this_rq(), eas_stats.sis_idle); return target; + } /* * If the prevous cpu is cache affine and idle, don't be stupid. 
*/ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine); + schedstat_inc(this_rq(), eas_stats.sis_cache_affine); + return prev; + } } if (!(current->flags & PF_WAKE_UP_IDLE) && @@ -6749,24 +7243,30 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; if (sysctl_sched_cstate_aware) { for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + int idle_idx = idle_get_state_idx(cpu_rq(i)); unsigned long new_usage = boosted_task_util(p); unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) goto next; - if (i == target && new_usage <= capacity_curr_of(target)) + if (i == target && new_usage <= capacity_curr_of(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap); + schedstat_inc(this_rq(), eas_stats.sis_suff_cap); + schedstat_inc(sd, eas_stats.sis_suff_cap); return target; + } - if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { - best_idle = i; + if (idle_idx < best_idle_cstate && + capacity_orig <= best_idle_capacity) { + best_idle_cpu = i; best_idle_cstate = idle_idx; best_idle_capacity = capacity_orig; } @@ -6779,231 +7279,283 @@ static int select_idle_sibling(struct task_struct *p, int target) target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu); + schedstat_inc(this_rq(), eas_stats.sis_idle_cpu); + schedstat_inc(sd, eas_stats.sis_idle_cpu); goto done; } next: sg = sg->next; } while (sg != sd->groups); } - if (best_idle > 0) - target = best_idle; + + if (best_idle_cpu >= 0) + target = best_idle_cpu; done: + schedstat_inc(p, se.statistics.nr_wakeups_sis_count); + schedstat_inc(this_rq(), eas_stats.sis_count); + return target; } -static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) { - int iter_cpu; - int target_cpu = -1; - int target_util = 0; - int backup_capacity = 0; - int best_idle_cpu = -1; - int best_idle_cstate = INT_MAX; - int backup_cpu = -1; - unsigned long task_util_boosted, new_util; - - task_util_boosted = boosted_task_util(p); - for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { - int cur_capacity; - struct rq *rq; - int idle_idx; - - /* - * Iterate from higher cpus for boosted tasks. - */ - int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + unsigned long util, capacity; - if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) - continue; - - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - new_util = cpu_util(i) + task_util_boosted; +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. 
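Aside, not part of the patch: outside the WALT case above, cpu_util_wake() below amounts to "CPU utilization minus the waking task's own contribution, clamped to [0, capacity]", and the subtraction only applies when the task is still accounted on that CPU (cpu == task_cpu(p) and the entity has been attached at least once). A sketch with invented numbers:

#include <stdio.h>

static unsigned long cpu_util_wake_sketch(unsigned long cpu_util,
					  unsigned long task_util,
					  unsigned long capacity_orig)
{
	/* max_t(long, cpu_util - task_util, 0) */
	unsigned long util = (cpu_util > task_util) ? cpu_util - task_util : 0;

	/* Never report more than the CPU can actually provide. */
	return (util >= capacity_orig) ? capacity_orig : util;
}

int main(void)
{
	/* prev_cpu still carries the blocked task: subtract it. */
	printf("%lu\n", cpu_util_wake_sketch(600, 250, 1024));	/* 350 */
	/* A small task on an over-committed CPU: clamp at capacity. */
	printf("%lu\n", cpu_util_wake_sketch(1300, 100, 1024));	/* 1024 */
	return 0;
}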
+ */ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); - /* - * Ensure minimum capacity to grant the required boost. - * The target CPU can be already at a capacity level higher - * than the one required to boost the task. - */ - if (new_util > capacity_orig_of(i)) - continue; + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_util(cpu) - task_util(p), 0); - /* - * Unconditionally favoring tasks that prefer idle cpus to - * improve latency. - */ - if (idle_cpu(i) && prefer_idle) { - if (best_idle_cpu < 0) - best_idle_cpu = i; - continue; - } + return (util >= capacity) ? capacity : util; +} - cur_capacity = capacity_curr_of(i); - rq = cpu_rq(i); - idle_idx = idle_get_state_idx(rq); +static int start_cpu(bool boosted) +{ + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - if (new_util < cur_capacity) { - if (cpu_rq(i)->nr_running) { - if (prefer_idle) { - /* Find a target cpu with highest - * utilization. - */ - if (target_util == 0 || - target_util < new_util) { - target_cpu = i; - target_util = new_util; - } - } else { - /* Find a target cpu with lowest - * utilization. - */ - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; - } - } - } else if (!prefer_idle) { - if (best_idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - best_idle_cpu = i; - } - } - } else if (backup_capacity == 0 || - backup_capacity > cur_capacity) { - // Find a backup cpu with least capacity. - backup_capacity = cur_capacity; - backup_cpu = i; - } - } + RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(), + "sched RCU must be held"); - if (prefer_idle && best_idle_cpu >= 0) - target_cpu = best_idle_cpu; - else if (target_cpu < 0) - target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; - - return target_cpu; + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } -static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) +static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { + int target_cpu = -1; + unsigned long target_util = prefer_idle ? 
ULONG_MAX : 0; + unsigned long backup_capacity = ULONG_MAX; + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + int backup_cpu = -1; + unsigned long min_util = boosted_task_util(p); struct sched_domain *sd; - struct sched_group *sg, *sg_target; - int target_max_cap = INT_MAX; - int target_cpu = task_cpu(p); - unsigned long task_util_boosted, new_util; - int i; + struct sched_group *sg; + int cpu = start_cpu(boosted); - if (sysctl_sched_sync_hint_enable && sync) { - int cpu = smp_processor_id(); - cpumask_t search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); - if (cpumask_test_cpu(cpu, &search_cpus)) - return cpu; + schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); + schedstat_inc(this_rq(), eas_stats.fbt_attempts); + + if (cpu < 0) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); + schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); + return target_cpu; } - sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); + sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!sd) - return target; + if (!sd) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd); + schedstat_inc(this_rq(), eas_stats.fbt_no_sd); + return target_cpu; + } sg = sd->groups; - sg_target = sg; - if (sysctl_sched_is_big_little) { + do { + int i; - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + unsigned long cur_capacity, new_util, wake_util; + unsigned long min_wake_util = ULONG_MAX; - /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. - */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); + if (!cpu_online(i)) + continue; - task_util_boosted = boosted_task_util(p); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - new_util = cpu_util(i) + task_util_boosted; + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util(p); /* * Ensure minimum capacity to grant the required boost. * The target CPU can be already at a capacity level higher * than the one required to boost the task. */ + new_util = max(min_util, new_util); + if (new_util > capacity_orig_of(i)) continue; - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; + /* + * Unconditionally favoring tasks that prefer idle cpus to + * improve latency. + */ + if (idle_cpu(i) && prefer_idle) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); + return i; + } + + cur_capacity = capacity_curr_of(i); + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + /* + * Find a target cpu with the lowest/highest + * utilization if prefer_idle/!prefer_idle. 
+ */ + if (prefer_idle) { + /* Favor the CPU that last ran the task */ + if (new_util > target_util || + wake_util > min_wake_util) + continue; + min_wake_util = wake_util; + target_util = new_util; + target_cpu = i; + } else if (target_util < new_util) { + target_util = new_util; + target_cpu = i; + } + } else if (!prefer_idle) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + + if (best_idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + best_idle_cpu = i; + } + } + } else if (backup_capacity > cur_capacity) { + /* Find a backup cpu with least capacity. */ + backup_capacity = cur_capacity; + backup_cpu = i; } + } + } while (sg = sg->next, sg != sd->groups); + + if (target_cpu < 0) + target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; + + if (target_cpu >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq(), eas_stats.fbt_count); + } + + return target_cpu; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) +{ + struct sched_domain *sd; + int target_cpu = prev_cpu, tmp_target; + bool boosted, prefer_idle; + + schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); + schedstat_inc(this_rq(), eas_stats.secb_attempts); - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_sync); + schedstat_inc(this_rq(), eas_stats.secb_sync); + return cpu; } - } else { - /* - * Find a cpu with sufficient capacity - */ + } + + rcu_read_lock(); #ifdef CONFIG_CGROUP_SCHEDTUNE - bool boosted = schedtune_task_boost(p) > 0; - bool prefer_idle = schedtune_prefer_idle(p) > 0; + boosted = schedtune_task_boost(p) > 0; + prefer_idle = schedtune_prefer_idle(p) > 0; #else - bool boosted = 0; - bool prefer_idle = 0; + boosted = get_sysctl_sched_cfs_boost() > 0; + prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted, prefer_idle); - if (tmp_target >= 0) { - target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) - return target_cpu; + + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + /* Find a cpu with sufficient capacity */ + tmp_target = find_best_target(p, boosted, prefer_idle); + + if (!sd) + goto unlock; + if (tmp_target >= 0) { + target_cpu = tmp_target; + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq(), eas_stats.secb_idle_bt); + goto unlock; } } - if (target_cpu != task_cpu(p)) { + if (target_cpu != prev_cpu) { struct energy_env eenv = { - .util_delta = task_util(p), - 
.src_cpu = task_cpu(p), - .dst_cpu = target_cpu, - .task = p, + .util_delta = task_util(p), + .src_cpu = prev_cpu, + .dst_cpu = target_cpu, + .task = p, }; /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(task_cpu(p))) - return target_cpu; + if (cpu_overutilized(prev_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); + schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); + goto unlock; + } - if (energy_diff(&eenv) >= 0) - return task_cpu(p); + if (energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; + goto unlock; + } + + schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + goto unlock; } + schedstat_inc(p, se.statistics.nr_wakeups_secb_count); + schedstat_inc(this_rq(), eas_stats.secb_count); + +unlock: + rcu_read_unlock(); + return target_cpu; } @@ -7032,10 +7584,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return select_best_cpu(p, prev_cpu, 0, sync); #endif - if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || - energy_aware(); + if (sd_flag & SD_BALANCE_WAKE) { + /* + * do wake_cap unconditionally as it causes task and cpu + * utilization to be synced, and we need that for energy + * aware wakeups + */ + int _wake_cap = wake_cap(p, cpu, prev_cpu); + want_affine = !wake_wide(p) && !_wake_cap + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + } + + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) + return select_energy_cpu_brute(p, prev_cpu, sync); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -7060,49 +7621,65 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { - if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); - else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; + } else { + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); } - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } + while (sd) { + struct sched_group *group; + int weight; - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ } - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); } - /* while loop will break here if sd == NULL */ } rcu_read_unlock(); @@ -8105,8 +8682,13 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, + true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -8166,7 +8748,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -8404,13 +8986,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, max_capacity; + unsigned long capacity, max_capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -8424,6 +9007,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; max_capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -8456,6 +9040,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } max_capacity = max(capacity, max_capacity); + min_capacity = min(capacity, min_capacity); } } else { /* @@ -8473,6 +9058,7 @@ void 
update_group_capacity(struct sched_domain *sd, int cpu) if (!cpu_isolated(cpumask_first(cpus))) { capacity += sgc->capacity; max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); } group = group->next; } while (group != child->groups); @@ -8480,6 +9066,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = max_capacity; + sdg->sgc->min_capacity = min_capacity; } /* @@ -8761,15 +9348,21 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + /* - * Candiate sg has no more than one task per cpu and has higher - * per-cpu capacity. No reason to pull tasks to less capable cpus. + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && group_smaller_cpu_capacity(sds->local, sg)) return false; } +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -8820,6 +9413,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ +#define lb_sd_parent(sd) \ + (sd->parent && sd->parent->groups != sd->parent->groups->next) + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -8905,7 +9501,7 @@ next_group: env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; - if (!env->sd->parent) { + if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; @@ -9494,7 +10090,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved = 0, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; struct sched_group *group = NULL; struct rq *busiest = NULL; unsigned long flags; @@ -10778,6 +11374,61 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Catch up with the cfs_rq and remove our load when we leave */ + update_load_avg(se, 0); + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + +static void attach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? 
se->parent->depth + 1 : 0; +#endif + + /* Synchronize entity with its cfs_rq */ + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -10792,8 +11443,7 @@ static void detach_task_cfs_rq(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } - /* Catch up with the cfs_rq and remove our load when we leave */ - detach_entity_load_avg(cfs_rq, se); + detach_entity_cfs_rq(se); } static void attach_task_cfs_rq(struct task_struct *p) @@ -10801,16 +11451,7 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - - /* Synchronize task with its cfs_rq */ - attach_entity_load_avg(cfs_rq, se); + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -10864,6 +11505,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif @@ -10901,8 +11545,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -10917,6 +11562,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -10930,6 +11577,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; @@ -11026,8 +11677,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index df47c26ab6d2..ae6876e62c0f 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -1602,7 +1602,7 @@ unsigned int nr_eligible_big_tasks(int cpu) int nr_big = rq->hmp_stats.nr_big_tasks; int nr = rq->nr_running; - if (cpu_max_possible_capacity(cpu) != max_possible_capacity) + if (!is_max_capacity_cpu(cpu)) return nr_big; return nr; @@ -2521,10 +2521,42 @@ static inline u32 predict_and_update_buckets(struct rq *rq, return pred_demand; } -static void update_task_cpu_cycles(struct task_struct *p, int cpu) +#define THRESH_CC_UPDATE (2 * NSEC_PER_USEC) + +/* + * Assumes rq_lock is held and wallclock was recorded in the same critical + * section as this function's invocation. 
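+ *
+ * Calls to get_cpu_cycle_counter() are rate limited per cluster: if
+ * the cached cluster->cycles snapshot is younger than THRESH_CC_UPDATE
+ * (2us), it is returned as-is instead of querying the counter again.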
+ */ +static inline u64 read_cycle_counter(int cpu, u64 wallclock) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + u64 delta; + + if (unlikely(!cluster)) + return cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); + + /* + * Why don't we need locking here? Let's say that delta is negative + * because some other CPU happened to update last_cc_update with a + * more recent timestamp. We simply read the conter again in that case + * with no harmful side effects. This can happen if there is an FIQ + * between when we read the wallclock and when we use it here. + */ + delta = wallclock - atomic64_read(&cluster->last_cc_update); + if (delta > THRESH_CC_UPDATE) { + atomic64_set(&cluster->cycles, + cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu)); + atomic64_set(&cluster->last_cc_update, wallclock); + } + + return atomic64_read(&cluster->cycles); +} + +static void update_task_cpu_cycles(struct task_struct *p, int cpu, + u64 wallclock) { if (use_cycle_counter) - p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); + p->cpu_cycles = read_cycle_counter(cpu, wallclock); } static void @@ -2542,7 +2574,7 @@ update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, return; } - cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); + cur_cycles = read_cycle_counter(cpu, wallclock); /* * If current task is idle task and irqtime == 0 CPU was @@ -2579,7 +2611,8 @@ update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); } -static int account_busy_for_task_demand(struct task_struct *p, int event) +static int +account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event) { /* * No need to bother updating task demand for exiting tasks @@ -2598,6 +2631,17 @@ static int account_busy_for_task_demand(struct task_struct *p, int event) (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) return 0; + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_ACCOUNT_WAIT_TIME : 0; + } + return 1; } @@ -2738,7 +2782,7 @@ static u64 update_task_demand(struct task_struct *p, struct rq *rq, u64 runtime; new_window = mark_start < window_start; - if (!account_busy_for_task_demand(p, event)) { + if (!account_busy_for_task_demand(rq, p, event)) { if (new_window) /* * If the time accounted isn't being accounted as @@ -2822,7 +2866,7 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event, update_window_start(rq, wallclock); if (!p->ravg.mark_start) { - update_task_cpu_cycles(p, cpu_of(rq)); + update_task_cpu_cycles(p, cpu_of(rq), wallclock); goto done; } @@ -2890,7 +2934,7 @@ void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) if (is_idle_task(curr)) { /* We're here without rq->lock held, IRQ disabled */ raw_spin_lock(&rq->lock); - update_task_cpu_cycles(curr, cpu); + update_task_cpu_cycles(curr, cpu, sched_ktime_clock()); raw_spin_unlock(&rq->lock); } } @@ -2935,7 +2979,7 @@ void mark_task_starting(struct task_struct *p) p->ravg.mark_start = p->last_wake_ts = wallclock; p->last_cpu_selected_ts = wallclock; p->last_switch_out_ts = 0; - update_task_cpu_cycles(p, cpu_of(rq)); + update_task_cpu_cycles(p, cpu_of(rq), wallclock); } void set_window_start(struct rq *rq) @@ -3548,7 +3592,7 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); - update_task_cpu_cycles(p, new_cpu); + update_task_cpu_cycles(p, new_cpu, wallclock); new_task = is_new_task(p); /* Protected by rq_lock */ @@ -4303,8 +4347,20 @@ void note_task_waking(struct task_struct *p, u64 wallclock) { u64 sleep_time = wallclock - p->last_switch_out_ts; - p->last_wake_ts = wallclock; + /* + * When a short burst and short sleeping task goes for a long + * sleep, the task's avg_sleep_time gets boosted. It will not + * come below short_sleep threshold for a lot of time and it + * results in incorrect packing. The idead behind tracking + * avg_sleep_time is to detect if a task is short sleeping + * or not. So limit the sleep time to twice the short sleep + * threshold. For regular long sleeping tasks, the avg_sleep_time + * would be higher than threshold, and packing happens correctly. + */ + sleep_time = min_t(u64, sleep_time, 2 * sysctl_sched_short_sleep); update_avg(&p->ravg.avg_sleep_time, sleep_time); + + p->last_wake_ts = wallclock; } #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index b0b93fd33af9..f8e8d68ed3fd 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -201,8 +201,9 @@ void calc_load_exit_idle(void) struct rq *this_rq = this_rq(); /* - * If we're still before the sample window, we're done. + * If we're still before the pending sample window, we're done. */ + this_rq->calc_load_update = calc_load_update; if (time_before(jiffies, this_rq->calc_load_update)) return; @@ -211,7 +212,6 @@ void calc_load_exit_idle(void) * accounted through the nohz accounting, so skip the entire deal and * sync up for the next window. 
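 * (this_rq->calc_load_update is now synced to the global
 * calc_load_update at the top of this function, before the early
 * return, so the +10 fixup below always operates on a current window.)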
*/ - this_rq->calc_load_update = calc_load_update; if (time_before(jiffies, this_rq->calc_load_update + 10)) this_rq->calc_load_update += LOAD_FREQ; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 29345ed74069..c03d51a017bf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,6 +5,7 @@ #include "sched.h" +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/irq_work.h> #include <trace/events/sched.h> @@ -1005,6 +1006,9 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1456,11 +1460,30 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) } #endif +/* + * Return whether the task on the given cpu is currently non-preemptible + * while handling a potentially long softint, or if the task is likely + * to block preemptions soon because it is a ksoftirq thread that is + * handling slow softints. + */ +bool +task_may_not_preempt(struct task_struct *task, int cpu) +{ + __u32 softirqs = per_cpu(active_softirqs, cpu) | + __IRQ_STAT(cpu, __softirq_pending); + struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu); + + return ((softirqs & LONG_SOFTIRQ_MASK) && + (task == cpu_ksoftirqd || + task_thread_info(task)->preempt_count & SOFTIRQ_MASK)); +} + static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool may_not_preempt; #ifdef CONFIG_SCHED_HMP return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); @@ -1476,7 +1499,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) curr = READ_ONCE(rq->curr); /* unlocked access */ /* - * If the current task on @p's runqueue is an RT task, then + * If the current task on @p's runqueue is a softirq task, + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -1497,17 +1530,22 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. */ - if (curr && unlikely(rt_task(curr)) && + may_not_preempt = task_may_not_preempt(curr, cpu); + if (may_not_preempt || + (unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + curr->prio <= p->prio))) { int target = find_lowest_rq(p); /* - * Don't bother moving it if the destination CPU is - * not running a lower priority task. + * If cpu is non-preemptible, prefer remote cpu + * even if it's running a higher-prio task. + * Otherwise: Don't bother moving it if the + * destination CPU is not running a lower priority task. 
*/ if (target != -1 && - p->prio < cpu_rq(target)->rt.highest_prio.curr) + (may_not_preempt || + p->prio < cpu_rq(target)->rt.highest_prio.curr)) cpu = target; } rcu_read_unlock(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 276a2387f06f..67b7da81f8a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -397,6 +397,8 @@ struct sched_cluster { unsigned int static_cluster_pwr_cost; int notifier_sent; bool wake_up_idle; + atomic64_t last_cc_update; + atomic64_t cycles; }; extern unsigned long all_cluster_ids[]; @@ -463,6 +465,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT @@ -509,7 +512,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count; + int throttled, throttle_count, throttle_uptodate; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -649,6 +652,9 @@ struct root_domain { /* Maximum cpu capacity in the system. */ struct max_cpu_capacity max_cpu_capacity; + + /* First cpu with maximum and minimum original capacity */ + int max_cap_orig_cpu, min_cap_orig_cpu; }; extern struct root_domain def_root_domain; @@ -706,6 +712,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -825,6 +832,9 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; +#ifdef CONFIG_SMP + struct eas_stats eas_stats; +#endif #endif #ifdef CONFIG_SMP @@ -995,6 +1005,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long max_capacity; /* Max per-cpu capacity in group */ + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* @@ -1225,6 +1236,11 @@ static inline bool hmp_capable(void) return max_possible_capacity != min_max_possible_capacity; } +static inline bool is_max_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == max_possible_capacity; +} + /* * 'load' is in reference to "best cpu" at its best frequency. 
* Scale that in reference to a given cpu, accounting for how bad it is @@ -1601,6 +1617,8 @@ static inline unsigned int nr_eligible_big_tasks(int cpu) return 0; } +static inline bool is_max_capacity_cpu(int cpu) { return true; } + static inline int pct_task_load(struct task_struct *p) { return 0; } static inline int cpu_capacity(int cpu) @@ -2149,6 +2167,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); static inline void __add_nr_running(struct rq *rq, unsigned count) { @@ -2662,6 +2681,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +/* + * task_may_not_preempt - check whether a task may not be preemptible soon + */ +extern bool task_may_not_preempt(struct task_struct *task, int cpu); + #else /* CONFIG_SMP */ /* @@ -2783,3 +2807,55 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. + * + * This function is called by the scheduler on the CPU whose utilization is + * being updated. + * + * It can only be called from RCU-sched read-side critical sections. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, rq_clock(rq), flags); +} + +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) +{ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_update_util(rq, flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c index 29d8a26a78ed..ba5a326a9fd8 100644 --- a/kernel/sched/sched_avg.c +++ b/kernel/sched/sched_avg.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2012, 2015-2016, The Linux Foundation. All rights reserved. +/* Copyright (c) 2012, 2015-2017, The Linux Foundation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -26,11 +26,13 @@ static DEFINE_PER_CPU(u64, nr_prod_sum); static DEFINE_PER_CPU(u64, last_time); static DEFINE_PER_CPU(u64, nr_big_prod_sum); static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); static s64 last_get_time; +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) /** * sched_get_nr_running_avg * @return: Average nr_running, iowait and nr_big_tasks value since last poll. @@ -40,7 +42,8 @@ static s64 last_get_time; * Obtains the average nr_running value since the last poll. * This function may not be called concurrently with itself */ -void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg) +void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg, + unsigned int *max_nr, unsigned int *big_max_nr) { int cpu; u64 curr_time = sched_clock(); @@ -50,6 +53,8 @@ void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg) *avg = 0; *iowait_avg = 0; *big_avg = 0; + *max_nr = 0; + *big_max_nr = 0; if (!diff) return; @@ -78,17 +83,35 @@ void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg) per_cpu(nr_big_prod_sum, cpu) = 0; per_cpu(iowait_prod_sum, cpu) = 0; + if (*max_nr < per_cpu(nr_max, cpu)) + *max_nr = per_cpu(nr_max, cpu); + + if (is_max_capacity_cpu(cpu)) { + if (*big_max_nr < per_cpu(nr_max, cpu)) + *big_max_nr = per_cpu(nr_max, cpu); + } + + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); } diff = curr_time - last_get_time; last_get_time = curr_time; - *avg = (int)div64_u64(tmp_avg * 100, diff); - *big_avg = (int)div64_u64(tmp_big_avg * 100, diff); - *iowait_avg = (int)div64_u64(tmp_iowait * 100, diff); - - trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg); + /* + * Any task running on BIG cluster and BIG tasks running on little + * cluster contributes to big_avg. Small or medium tasks can also + * run on BIG cluster when co-location and scheduler boost features + * are activated. We don't want these tasks to downmigrate to little + * cluster when BIG CPUs are available but isolated. Round up the + * average values so that core_ctl aggressively unisolate BIG CPUs. 
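+ *
+ * DIV64_U64_ROUNDUP(X, Y) expands to div64_u64((X) + (Y - 1), Y), so
+ * any non-zero busy residue is reported as a whole task rather than
+ * being truncated to zero.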
+ */ + *avg = (int)DIV64_U64_ROUNDUP(tmp_avg, diff); + *big_avg = (int)DIV64_U64_ROUNDUP(tmp_big_avg, diff); + *iowait_avg = (int)DIV64_U64_ROUNDUP(tmp_iowait, diff); + + trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg, + *max_nr, *big_max_nr); BUG_ON(*avg < 0 || *big_avg < 0 || *iowait_avg < 0); pr_debug("%s - avg:%d big_avg:%d iowait_avg:%d\n", @@ -121,6 +144,9 @@ void sched_update_nr_prod(int cpu, long delta, bool inc) BUG_ON((s64)per_cpu(nr, cpu) < 0); + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + per_cpu(nr_prod_sum, cpu) += nr_running * diff; per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff; per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 87e2c9f0c33e..6d74a7c77c8c 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -12,6 +12,28 @@ */ #define SCHEDSTAT_VERSION 15 +#ifdef CONFIG_SMP +static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) +{ + /* eas-specific runqueue stats */ + seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ", + stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine, + stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count); + + seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ", + stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt, + stats->secb_insuff_cap, stats->secb_no_nrg_sav, + stats->secb_nrg_sav, stats->secb_count); + + seq_printf(seq, "%llu %llu %llu %llu %llu ", + stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd, + stats->fbt_pref_idle, stats->fbt_count); + + seq_printf(seq, "%llu %llu\n", + stats->cas_attempts, stats->cas_count); +} +#endif + static int show_schedstat(struct seq_file *seq, void *v) { int cpu; @@ -40,6 +62,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); #ifdef CONFIG_SMP + show_easstat(seq, &rq->eas_stats); + /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { @@ -66,6 +90,8 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + + show_easstat(seq, &sd->eas_stats); } rcu_read_unlock(); #endif diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b0c5fe6d1f3b..a71e94cecdb6 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -12,11 +12,12 @@ #include "tune.h" #ifdef CONFIG_CGROUP_SCHEDTUNE -static bool schedtune_initialized = false; +bool schedtune_initialized = false; #endif unsigned int sysctl_sched_cfs_boost __read_mostly; +extern struct reciprocal_value schedtune_spc_rdiv; extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ @@ -675,6 +676,9 @@ int schedtune_task_boost(struct task_struct *p) struct schedtune *st; int task_boost; + if (!unlikely(schedtune_initialized)) + return 0; + /* Get task boost value */ rcu_read_lock(); st = task_schedtune(p); @@ -689,6 +693,9 @@ int schedtune_prefer_idle(struct task_struct *p) struct schedtune *st; int prefer_idle; + if (!unlikely(schedtune_initialized)) + return 0; + /* Get prefer_idle value */ rcu_read_lock(); st = task_schedtune(p); @@ -822,6 +829,7 @@ schedtune_boostgroup_init(struct schedtune *st) bg = &per_cpu(cpu_boost_groups, cpu); bg->group[st->idx].boost = 0; bg->group[st->idx].tasks = 0; + raw_spin_lock_init(&bg->lock); } return 0; @@ -1121,9 +1129,12 @@ schedtune_init(void) pr_info("schedtune: configured to support global boosting 
only\n"); #endif + schedtune_spc_rdiv = reciprocal_value(100); + return 0; nodata: + pr_warning("schedtune: disabled!\n"); rcu_read_unlock(); return -EINVAL; } diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a972fd..b92a047ddc82 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -503,7 +503,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, + bool *resched_timer) { struct sigqueue *q, *first = NULL; @@ -525,6 +526,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); + + *resched_timer = + (first->flags & SIGQUEUE_PREALLOC) && + (info->si_code == SI_TIMER) && + (info->si_sys_private); + __sigqueue_free(first); } else { /* @@ -541,12 +548,12 @@ still_pending: } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - siginfo_t *info) + siginfo_t *info, bool *resched_timer) { int sig = next_signal(pending, mask); if (sig) - collect_signal(sig, pending, info); + collect_signal(sig, pending, info, resched_timer); return sig; } @@ -558,15 +565,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, */ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { + bool resched_timer = false; int signr; /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, - mask, info); + mask, info, &resched_timer); /* * itimer signal ? * @@ -611,7 +619,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } - if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { + if (resched_timer) { /* * Release the siglock to ensure proper locking order * of timer locks outside of siglocks. Note, we leave diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f787..39ffd41594ce 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,6 +57,13 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +/* + * active_softirqs -- per cpu, a mask of softirqs that are being handled, + * with the expectation that approximate answers are acceptable and therefore + * no synchronization. 
+ */ +DEFINE_PER_CPU(__u32, active_softirqs); + const char * const softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" @@ -253,6 +260,7 @@ asmlinkage __visible void __do_softirq(void) restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); + __this_cpu_write(active_softirqs, pending); local_irq_enable(); @@ -282,6 +290,7 @@ restart: pending >>= softirq_bit; } + __this_cpu_write(active_softirqs, 0); rcu_bh_qs(); local_irq_disable(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 816999804a16..f27d2ba78d14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -522,13 +522,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_granularity_ns, }, { - .procname = "sched_is_big_little", - .data = &sysctl_sched_is_big_little, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "sched_sync_hint_enable", .data = &sysctl_sched_sync_hint_enable, .maxlen = sizeof(unsigned int), diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2af5687b83c9..1a4de0022cc5 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -569,7 +569,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add(start, base->gettime()); + start = ktime_add_safe(start, base->gettime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -655,7 +655,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) overrun++; } - alarm->node.expires = ktime_add(alarm->node.expires, interval); + alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); return overrun; } EXPORT_SYMBOL_GPL(alarm_forward); @@ -843,13 +843,21 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, /* start the timer */ timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); + + /* + * Rate limit to the tick as a hot fix to prevent DOS. Will be + * mopped up later. 
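+ * Clamping to TICK_NSEC bounds a periodic alarm to at most HZ
+ * expirations per second; a tiny interval (e.g. 1 ns) supplied from
+ * userspace could otherwise re-arm the timer in a tight loop.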
+ */ + if (ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC) + timr->it.alarm.interval = ktime_set(0, TICK_NSEC); + exp = timespec_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now; now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); - exp = ktime_add(now, exp); + exp = ktime_add_safe(now, exp); } alarm_start(&timr->it.alarm.alarmtimer, exp); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 01a49614e942..e7c2392666cb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -49,7 +49,6 @@ #include <linux/sched/deadline.h> #include <linux/timer.h> #include <linux/freezer.h> -#include <linux/delay.h> #include <asm/uaccess.h> @@ -1593,42 +1592,22 @@ static void init_hrtimers_cpu(int cpu) } #if defined(CONFIG_HOTPLUG_CPU) -static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base, - struct hrtimer_cpu_base *new_base, - unsigned int i, - bool wait, +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base, bool remove_pinned) { struct hrtimer *timer; struct timerqueue_node *node; struct timerqueue_head pinned; int is_pinned; - struct hrtimer_clock_base *old_c_base = &old_base->clock_base[i]; - struct hrtimer_clock_base *new_c_base = &new_base->clock_base[i]; + bool is_hotplug = !cpu_online(old_base->cpu_base->cpu); timerqueue_init_head(&pinned); - while ((node = timerqueue_getnext(&old_c_base->active))) { + while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); - if (wait) { - /* Ensure timers are done running before continuing */ - while (hrtimer_callback_running(timer)) { - raw_spin_unlock(&old_base->lock); - raw_spin_unlock(&new_base->lock); - cpu_relax(); - /* - * cpu_relax may just be a barrier. Grant the - * run_hrtimer_list code some time to obtain the - * spinlock. - */ - udelay(2); - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, - SINGLE_DEPTH_NESTING); - } - } else { + if (is_hotplug) BUG_ON(hrtimer_callback_running(timer)); - } debug_deactivate(timer); /* @@ -1636,7 +1615,7 @@ static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base, * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_c_base, HRTIMER_STATE_ENQUEUED, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); is_pinned = timer->state & HRTIMER_STATE_PINNED; if (!remove_pinned && is_pinned) { @@ -1644,7 +1623,7 @@ static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base, continue; } - timer->base = new_c_base; + timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer @@ -1653,7 +1632,7 @@ static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base, * sort out already expired timers and reprogram the * event device. 
*/ - enqueue_hrtimer(timer, new_c_base); + enqueue_hrtimer(timer, new_base); } /* Re-queue pinned timers for non-hotplug usecase */ @@ -1661,11 +1640,11 @@ static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base, timer = container_of(node, struct hrtimer, node); timerqueue_del(&pinned, &timer->node); - enqueue_hrtimer(timer, old_c_base); + enqueue_hrtimer(timer, old_base); } } -static void __migrate_hrtimers(int scpu, bool wait, bool remove_pinned) +static void __migrate_hrtimers(int scpu, bool remove_pinned) { struct hrtimer_cpu_base *old_base, *new_base; unsigned long flags; @@ -1682,8 +1661,8 @@ static void __migrate_hrtimers(int scpu, bool wait, bool remove_pinned) raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(old_base, new_base, i, wait, - remove_pinned); + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i], remove_pinned); } raw_spin_unlock(&old_base->lock); @@ -1699,12 +1678,12 @@ static void migrate_hrtimers(int scpu) BUG_ON(cpu_online(scpu)); tick_cancel_sched_timer(scpu); - __migrate_hrtimers(scpu, false, true); + __migrate_hrtimers(scpu, true); } void hrtimer_quiesce_cpu(void *cpup) { - __migrate_hrtimers(*(int *)cpup, true, false); + __migrate_hrtimers(*(int *)cpup, false); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 80016b329d94..051544aec37c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1250,7 +1250,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - unsigned long long now; + unsigned long long now = 0; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5fa544f3f560..738f3467d169 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -116,6 +116,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->offs_boot = ktime_add(tk->offs_boot, delta); } +/* + * tk_clock_read - atomic clocksource read() helper + * + * This helper is necessary to use in the read paths because, while the + * seqlock ensures we don't return a bad value while structures are updated, + * it doesn't protect from potential crashes. There is the possibility that + * the tkr's clocksource may change between the read reference, and the + * clock reference passed to the read function. This can cause crashes if + * the wrong clocksource is passed to the wrong read function. + * This isn't necessary to use when holding the timekeeper_lock or doing + * a read of the fast-timekeeper tkrs (which is protected by its own locking + * and update logic). 
+ */ +static inline u64 tk_clock_read(struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} + #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ @@ -173,7 +193,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) */ do { seq = read_seqcount_begin(&tk_core.seq); - now = tkr->read(tkr->clock); + now = tk_clock_read(tkr); last = tkr->cycle_last; mask = tkr->mask; max = tkr->clock->max_cycles; @@ -207,7 +227,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) cycle_t cycle_now, delta; /* read clocksource */ - cycle_now = tkr->read(tkr->clock); + cycle_now = tk_clock_read(tkr); /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); @@ -235,12 +255,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; - tk->tkr_mono.read = clock->read; tk->tkr_mono.mask = clock->mask; - tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; - tk->tkr_raw.read = clock->read; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; @@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) now += timekeeping_delta_to_ns(tkr, clocksource_delta( - tkr->read(tkr->clock), + tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); @@ -461,6 +479,10 @@ static cycle_t dummy_clock_read(struct clocksource *cs) return cycles_at_suspend; } +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. @@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk) struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - cycles_at_suspend = tkr->read(tkr->clock); - tkr_dummy.read = dummy_clock_read; + cycles_at_suspend = tk_clock_read(tkr); + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - tkr_dummy.read = dummy_clock_read; + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } @@ -647,11 +669,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; s64 nsec; - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -1434,7 +1455,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. 
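+ *
+ * By snapshotting tkr->clock once with READ_ONCE() and calling that
+ * clocksource's own read(), the read function and its argument can no
+ * longer come from two different clocksources, which the old
+ * tkr->read(tkr->clock) pattern allowed.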
*/ - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { u64 num, max = ULLONG_MAX; @@ -1829,7 +1850,7 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9956440d0e6..12ea4ea619ee 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1471,6 +1471,11 @@ static __init int kprobe_trace_self_tests_init(void) end: release_all_trace_kprobes(); + /* + * Wait for the optimizer work to finish. Otherwise it might fiddle + * with probes in already freed __init text. + */ + wait_for_kprobe_optimizer(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else |