Diffstat (limited to 'kernel')
65 files changed, 11549 insertions, 266 deletions
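Among other things, the patch below extends PM QoS with per-CPU constraint tracking: `target_per_cpu[]` in `struct pm_qos_constraints`, the `PM_QOS_REQ_AFFINE_CORES` / `PM_QOS_REQ_AFFINE_IRQ` request types with a `cpus_affine` mask, and the new `pm_qos_request_for_cpu()` / `pm_qos_request_for_cpumask()` lookups. The following is a minimal, hypothetical caller sketch (not part of the patch): the request types, struct fields, and query functions come from the diff; the driver functions, the latency value, and the CPU choice are illustrative assumptions only.

```c
/*
 * Hypothetical usage of the per-CPU PM QoS additions in this patch.
 * PM_QOS_CPU_DMA_LATENCY is the usual upstream class; my_* names are made up.
 */
#include <linux/pm_qos.h>
#include <linux/cpumask.h>

static struct pm_qos_request my_req;

static void my_driver_start(void)
{
	/* Request applies only to CPUs 0-1 via the new cpus_affine mask */
	my_req.type = PM_QOS_REQ_AFFINE_CORES;
	cpumask_clear(&my_req.cpus_affine);
	cpumask_set_cpu(0, &my_req.cpus_affine);
	cpumask_set_cpu(1, &my_req.cpus_affine);

	/* Limit wakeup latency on those CPUs to 100 usec (illustrative value) */
	pm_qos_add_request(&my_req, PM_QOS_CPU_DMA_LATENCY, 100);
}

static int my_idle_governor_limit(int cpu)
{
	/* Effective constraint for a single CPU, from target_per_cpu[] */
	return pm_qos_request_for_cpu(PM_QOS_CPU_DMA_LATENCY, cpu);
}

static void my_driver_stop(void)
{
	pm_qos_remove_request(&my_req);
}
```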
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dc94f8beb097..e8d71110ed2a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2006,7 +2006,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; - struct cgroup_root *root; + struct cgroup_root *root = NULL; struct cgroup_sb_opts opts; struct dentry *dentry; int ret; @@ -2671,6 +2671,45 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } +int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; + + if (capable(CAP_SYS_NICE)) + return 0; + + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); + + if (current != task && !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + return -EACCES; + } + + return 0; +} + +static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + int i; + int ret; + + for_each_css(css, i, cgrp) { + if (css->ss->allow_attach) { + ret = css->ss->allow_attach(tset); + if (ret) + return ret; + } else { + return -EACCES; + } + } + + return 0; +} + static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) @@ -2685,8 +2724,24 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - ret = -EACCES; + !uid_eq(cred->euid, tcred->suid)) { + /* + * if the default permission check fails, give each + * cgroup a chance to extend the permission check + */ + struct cgroup_taskset tset = { + .src_csets = LIST_HEAD_INIT(tset.src_csets), + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), + .csets = &tset.src_csets, + }; + struct css_set *cset; + cset = task_css_set(task); + list_add(&cset->mg_node, &tset.src_csets); + ret = cgroup_allow_attach(dst_cgrp, &tset); + list_del(&tset.src_csets); + if (ret) + ret = -EACCES; + } if (!ret && cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e26e23b..37731292f8a1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,8 @@ #include <linux/irq.h> #include <trace/events/power.h> +#include <trace/events/sched.h> + #include "smpboot.h" #ifdef CONFIG_SMP @@ -425,6 +427,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) out_release: cpu_hotplug_done(); + trace_sched_cpu_hotplug(cpu, err, 0); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); return err; @@ -530,6 +533,7 @@ out_notify: __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); out: cpu_hotplug_done(); + trace_sched_cpu_hotplug(cpu, ret, 1); return ret; } @@ -827,3 +831,23 @@ void init_cpu_online(const struct cpumask *src) { cpumask_copy(to_cpumask(cpu_online_bits), src); } + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); + +void idle_notifier_call_chain(unsigned long val) +{ + atomic_notifier_call_chain(&idle_notifier, val, NULL); +} +EXPORT_SYMBOL_GPL(idle_notifier_call_chain); diff --git 
a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 009cc9a17d95..774bfe7a2893 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -22,14 +22,17 @@ #include <linux/spinlock.h> #include <linux/syscore_ops.h> +bool from_suspend = false; + static DEFINE_RWLOCK(cpu_pm_notifier_lock); static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); -static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) +static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls, + void *data) { int ret; - ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, + ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, data, nr_to_call, nr_calls); return notifier_to_errno(ret); @@ -101,13 +104,13 @@ int cpu_pm_enter(void) int ret = 0; read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); + ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls, NULL); if (ret) /* * Inform listeners (nr_calls - 1) about failure of CPU PM * PM entry who are notified earlier to prepare for it. */ - cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); + cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL, NULL); read_unlock(&cpu_pm_notifier_lock); return ret; @@ -131,7 +134,7 @@ int cpu_pm_exit(void) int ret; read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL); + ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL, NULL); read_unlock(&cpu_pm_notifier_lock); return ret; @@ -154,19 +157,21 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); * * Return conditions are same as __raw_notifier_call_chain. */ -int cpu_cluster_pm_enter(void) +int cpu_cluster_pm_enter(unsigned long aff_level) { int nr_calls; int ret = 0; read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); + ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls, + (void *) aff_level); if (ret) /* * Inform listeners (nr_calls - 1) about failure of CPU cluster * PM entry who are notified earlier to prepare for it. */ - cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); + cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL, + (void *) aff_level); read_unlock(&cpu_pm_notifier_lock); return ret; @@ -188,12 +193,12 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * * Return conditions are same as __raw_notifier_call_chain. 
*/ -int cpu_cluster_pm_exit(void) +int cpu_cluster_pm_exit(unsigned long aff_level) { int ret; read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); + ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL, (void *) aff_level); read_unlock(&cpu_pm_notifier_lock); return ret; @@ -205,17 +210,19 @@ static int cpu_pm_suspend(void) { int ret; + from_suspend = true; ret = cpu_pm_enter(); if (ret) return ret; - ret = cpu_cluster_pm_enter(); + ret = cpu_cluster_pm_enter(0); return ret; } static void cpu_pm_resume(void) { - cpu_cluster_pm_exit(); + from_suspend = false; + cpu_cluster_pm_exit(0); cpu_pm_exit(); } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index fc1ef736253c..0b891286a150 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize) int i; int diag, dtab_count; int key; - + static int last_crlf; diag = kdbgetintenv("DTABCOUNT", &dtab_count); if (diag) @@ -237,6 +237,9 @@ poll_again: return buffer; if (key != 9) tab = 0; + if (key != 10 && key != 13) + last_crlf = 0; + switch (key) { case 8: /* backspace */ if (cp > buffer) { @@ -254,7 +257,12 @@ poll_again: *cp = tmp; } break; - case 13: /* enter */ + case 10: /* new line */ + case 13: /* carriage return */ + /* handle \n after \r */ + if (last_crlf && last_crlf != key) + break; + last_crlf = key; *lastchar++ = '\n'; *lastchar++ = '\0'; if (!KDB_STATE(KGDB_TRANS)) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 1e889a078dbc..96100cc046c5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -176,7 +176,11 @@ static struct srcu_struct pmus_srcu; * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ +#ifdef CONFIG_PERF_EVENTS_USERMODE +int sysctl_perf_event_paranoid __read_mostly = -1; +#else int sysctl_perf_event_paranoid __read_mostly = 1; +#endif /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ @@ -1657,7 +1661,32 @@ static int __perf_remove_from_context(void *info) } -/* +#ifdef CONFIG_SMP +static void perf_retry_remove(struct perf_event *event, + struct remove_event *rep) +{ + int up_ret; + /* + * CPU was offline. Bring it online so we can + * gracefully exit a perf context. + */ + up_ret = cpu_up(event->cpu); + if (!up_ret) + /* Try the remove call once again. */ + cpu_function_call(event->cpu, __perf_remove_from_context, + rep); + else + pr_err("Failed to bring up CPU: %d, ret: %d\n", + event->cpu, up_ret); +} +#else +static void perf_retry_remove(struct perf_event *event, + struct remove_event *rep) +{ +} +#endif + + /* * Remove the event from a task's (or a CPU's) list of events. * * CPU events are removed with a smp call. For task events we only @@ -1670,7 +1699,8 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. 
*/ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void __ref perf_remove_from_context(struct perf_event *event, + bool detach_group) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1678,6 +1708,7 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group .event = event, .detach_group = detach_group, }; + int ret; lockdep_assert_held(&ctx->mutex); @@ -1688,7 +1719,11 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group * already called __perf_remove_from_context from * perf_event_exit_cpu. */ - cpu_function_call(event->cpu, __perf_remove_from_context, &re); + ret = cpu_function_call(event->cpu, __perf_remove_from_context, + &re); + if (ret == -ENXIO) + perf_retry_remove(event, &re); + return; } @@ -3460,7 +3495,8 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (!task) { /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + if (event->owner != EVENT_OWNER_KERNEL && perf_paranoid_cpu() && + !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); /* @@ -3844,6 +3880,15 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { + struct perf_event *event = file->private_data; + + /* + * Event can be in state OFF because of a constraint check. + * Change to ACTIVE so that it gets cleaned up correctly. + */ + if ((event->state == PERF_EVENT_STATE_OFF) && + event->attr.constraint_duplicate) + event->state = PERF_EVENT_STATE_ACTIVE; put_event(file->private_data); return 0; } @@ -6920,6 +6965,8 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .events_across_hotplug = 1, }; #ifdef CONFIG_EVENT_TRACING @@ -7041,6 +7088,8 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .events_across_hotplug = 1, }; static inline void perf_tp_register(void) @@ -7319,6 +7368,8 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, + + .events_across_hotplug = 1, }; /* @@ -7400,6 +7451,8 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, + + .events_across_hotplug = 1, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -8272,6 +8325,9 @@ SYSCALL_DEFINE5(perf_event_open, if (err) return err; + if (attr.constraint_duplicate || attr.__reserved_1) + return -EINVAL; + if (!attr.exclude_kernel) { if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; @@ -9302,6 +9358,18 @@ static void __perf_event_exit_context(void *__info) rcu_read_unlock(); } +static void __perf_event_stop_swclock(void *__info) +{ + struct perf_event_context *ctx = __info; + struct perf_event *event, *tmp; + + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { + if (event->attr.config == PERF_COUNT_SW_CPU_CLOCK && + event->attr.type == PERF_TYPE_SOFTWARE) + cpu_clock_event_stop(event, 0); + } +} + static void perf_event_exit_cpu_context(int cpu) { struct perf_event_context *ctx; @@ -9311,20 +9379,56 @@ static void perf_event_exit_cpu_context(int cpu) idx = srcu_read_lock(&pmus_srcu); list_for_each_entry_rcu(pmu, &pmus, entry) { ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, 
__perf_event_exit_context, ctx, 1); + /* + * If keeping events across hotplugging is supported, do not + * remove the event list, but keep it alive across CPU hotplug. + * The context is exited via an fd close path when userspace + * is done and the target CPU is online. If software clock + * event is active, then stop hrtimer associated with it. + * Start the timer when the CPU comes back online. + */ + if (!pmu->events_across_hotplug) + smp_call_function_single(cpu, __perf_event_exit_context, + ctx, 1); + else + smp_call_function_single(cpu, __perf_event_stop_swclock, + ctx, 1); mutex_unlock(&ctx->mutex); } srcu_read_unlock(&pmus_srcu, idx); } +static void perf_event_start_swclock(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + struct perf_event *event, *tmp; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + if (pmu->events_across_hotplug) { + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + list_for_each_entry_safe(event, tmp, &ctx->event_list, + event_entry) { + if (event->attr.config == + PERF_COUNT_SW_CPU_CLOCK && + event->attr.type == PERF_TYPE_SOFTWARE) + cpu_clock_event_start(event, 0); + } + } + } + srcu_read_unlock(&pmus_srcu, idx); +} + static void perf_event_exit_cpu(int cpu) { perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } +static inline void perf_event_start_swclock(int cpu) { } #endif static int @@ -9363,6 +9467,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; + + case CPU_STARTING: + perf_event_start_swclock(cpu); + break; + default: break; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..7da5b674d16e 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -614,6 +614,8 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, + + .events_across_hotplug = 1, }; int __init init_hw_breakpoint(void) diff --git a/kernel/exit.c b/kernel/exit.c index 07110c6020a0..a32e83d567b9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -388,6 +388,7 @@ static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; + int mm_released; mm_release(tsk, mm); if (!mm) @@ -434,9 +435,12 @@ static void exit_mm(struct task_struct *tsk) enter_lazy_tlb(mm, current); task_unlock(tsk); mm_update_next_owner(mm); - mmput(mm); + + mm_released = mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); + if (mm_released) + set_tsk_thread_flag(tsk, TIF_MM_RELEASED); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -632,6 +636,7 @@ static void check_stack_usage(void) static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; + int islower = false; free = stack_not_used(current); @@ -640,11 +645,16 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", - current->comm, task_pid_nr(current), free); lowest_to_date = free; + islower = true; } spin_unlock(&low_water_lock); + + if (islower) { + printk(KERN_WARNING "%s (%d) used greatest stack depth: " + "%lu bytes left\n", + current->comm, task_pid_nr(current), free); + } } #else static inline void check_stack_usage(void) {} @@ -699,6 +709,9 @@ void do_exit(long code) } exit_signals(tsk); /* 
sets PF_EXITING */ + + sched_exit(tsk); + /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. diff --git a/kernel/fork.c b/kernel/fork.c index 1155eac61687..c9eb86b646ab 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/iocontext.h> +#include <linux/kasan.h> #include <linux/key.h> #include <linux/binfmts.h> #include <linux/mman.h> @@ -169,6 +170,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { + kasan_alloc_pages(virt_to_page(ti), THREAD_SIZE_ORDER); free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else @@ -692,8 +694,9 @@ EXPORT_SYMBOL_GPL(__mmdrop); /* * Decrement the use count and release all resources for an mm. */ -void mmput(struct mm_struct *mm) +int mmput(struct mm_struct *mm) { + int mm_freed = 0; might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) { @@ -711,7 +714,9 @@ void mmput(struct mm_struct *mm) if (mm->binfmt) module_put(mm->binfmt->module); mmdrop(mm); + mm_freed = 1; } + return mm_freed; } EXPORT_SYMBOL_GPL(mmput); @@ -800,7 +805,8 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { + !ptrace_may_access(task, mode) && + !capable(CAP_SYS_RESOURCE)) { mmput(mm); mm = ERR_PTR(-EACCES); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6ead200370da..5cb153a8474a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -319,6 +319,9 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); + if (!notify && old_notify) + cancel_work_sync(&old_notify->work); + if (old_notify) kref_put(&old_notify->kref, old_notify->release); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 0374a596cffa..d381f559e0ce 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -12,6 +12,8 @@ #include <linux/debug_locks.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/bug.h> +#include <soc/qcom/watchdog.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key) @@ -64,6 +66,11 @@ static void spin_dump(raw_spinlock_t *lock, const char *msg) owner ? owner->comm : "<none>", owner ? 
task_pid_nr(owner) : -1, lock->owner_cpu); +#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG + msm_trigger_wdog_bite(); +#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG) + BUG(); +#endif dump_stack(); } @@ -114,7 +121,7 @@ static void __spin_lock_debug(raw_spinlock_t *lock) __delay(1); } /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); + spin_bug(lock, "lockup suspected"); #ifdef CONFIG_SMP trigger_all_cpu_backtrace(); #endif @@ -167,6 +174,11 @@ static void rwlock_bug(rwlock_t *lock, const char *msg) printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", msg, raw_smp_processor_id(), current->comm, task_pid_nr(current), lock); +#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG + msm_trigger_wdog_bite(); +#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG) + BUG(); +#endif dump_stack(); } diff --git a/kernel/module.c b/kernel/module.c index 0e5c71195f18..fe5248ab3378 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2499,7 +2499,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* We'll tack temporary mod_kallsyms on the end. */ mod->init_size = ALIGN(mod->init_size, - __alignof__(struct mod_kallsyms)); + __alignof__(struct mod_kallsyms)); info->mod_kallsyms_init_off = mod->init_size; mod->init_size += sizeof(struct mod_kallsyms); mod->init_size = debug_align(mod->init_size); @@ -2578,7 +2578,13 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } -#ifdef CONFIG_DEBUG_KMEMLEAK +#if defined(CONFIG_DEBUG_KMEMLEAK) && defined(CONFIG_DEBUG_MODULE_SCAN_OFF) +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ + kmemleak_no_scan(mod->module_core); +} +#elif defined(CONFIG_DEBUG_KMEMLEAK) static void kmemleak_load_module(const struct module *mod, const struct load_info *info) { diff --git a/kernel/panic.c b/kernel/panic.c index 41e2b54f36b5..223564d3e1f8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -28,6 +28,9 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +/* Machine specific panic information string */ +char *mach_panic_string; + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; @@ -412,6 +415,11 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); + + if (mach_panic_string) + printk(KERN_WARNING "Board Information: %s\n", + mach_panic_string); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 02e8dfaa1ce2..84c480946fb2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -28,6 +28,15 @@ config SUSPEND_SKIP_SYNC of suspend, or they are content with invoking sync() from user-space before invoking suspend. Say Y if that's your case. +config WAKELOCK + bool "Android's method of preventing suspend" + default y + ---help--- + This allows applications to prevent the CPU from suspending while + they need it. + + Say Y if you are running an android userspace. 
+ config HIBERNATE_CALLBACKS bool diff --git a/kernel/power/Makefile b/kernel/power/Makefile index cb880a14cc39..22eb9ed879ad 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -12,3 +12,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o + +obj-$(CONFIG_SUSPEND) += wakeup_reason.o diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..e7f1f736a5b6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -18,6 +18,7 @@ #include <linux/workqueue.h> #include <linux/kmod.h> #include <trace/events/power.h> +#include <linux/wakeup_reason.h> /* * Timeout for stopping processes @@ -35,6 +36,9 @@ static int try_to_freeze_tasks(bool user_only) unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; +#ifdef CONFIG_PM_SLEEP + char suspend_abort[MAX_SUSPEND_ABORT_LEN]; +#endif do_gettimeofday(&start); @@ -64,6 +68,11 @@ static int try_to_freeze_tasks(bool user_only) break; if (pm_wakeup_pending()) { +#ifdef CONFIG_PM_SLEEP + pm_get_active_wakeup_sources(suspend_abort, + MAX_SUSPEND_ABORT_LEN); + log_suspend_abort_reason(suspend_abort); +#endif wakeup = true; break; } @@ -83,15 +92,17 @@ static int try_to_freeze_tasks(bool user_only) do_div(elapsed_msecs64, NSEC_PER_MSEC); elapsed_msecs = elapsed_msecs64; - if (todo) { + if (wakeup) { pr_cont("\n"); - pr_err("Freezing of tasks %s after %d.%03d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", - wakeup ? "aborted" : "failed", + pr_err("Freezing of tasks aborted after %d.%03d seconds", + elapsed_msecs / 1000, elapsed_msecs % 1000); + } else if (todo) { + pr_cont("\n"); + pr_err("Freezing of tasks failed after %d.%03d seconds" + " (%d tasks refusing to freeze, wq_busy=%d):\n", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); - if (!wakeup) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) @@ -99,7 +110,6 @@ static int try_to_freeze_tasks(bool user_only) sched_show_task(p); } read_unlock(&tasklist_lock); - } } else { pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, elapsed_msecs % 1000); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..8ecc7b3f7dd9 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -43,6 +43,8 @@ #include <linux/kernel.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -67,6 +69,8 @@ static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_constraints cpu_dma_constraints = { .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .target_per_cpu = { [0 ... (NR_CPUS - 1)] = + PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE }, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, @@ -81,6 +85,8 @@ static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_constraints network_lat_constraints = { .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .target_per_cpu = { [0 ... 
(NR_CPUS - 1)] = + PM_QOS_NETWORK_LAT_DEFAULT_VALUE }, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, @@ -91,11 +97,12 @@ static struct pm_qos_object network_lat_pm_qos = { .name = "network_latency", }; - static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_constraints network_tput_constraints = { .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .target_per_cpu = { [0 ... (NR_CPUS - 1)] = + PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE }, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, @@ -259,22 +266,60 @@ static const struct file_operations pm_qos_debug_fops = { .release = single_release, }; +static inline void pm_qos_set_value_for_cpus(struct pm_qos_constraints *c, + struct cpumask *cpus) +{ + struct pm_qos_request *req = NULL; + int cpu; + s32 qos_val[NR_CPUS] = { [0 ... (NR_CPUS - 1)] = c->default_value }; + + plist_for_each_entry(req, &c->list, node) { + for_each_cpu(cpu, &req->cpus_affine) { + switch (c->type) { + case PM_QOS_MIN: + if (qos_val[cpu] > req->node.prio) + qos_val[cpu] = req->node.prio; + break; + case PM_QOS_MAX: + if (req->node.prio > qos_val[cpu]) + qos_val[cpu] = req->node.prio; + break; + case PM_QOS_SUM: + qos_val[cpu] += req->node.prio; + break; + default: + BUG(); + break; + } + } + } + + for_each_possible_cpu(cpu) { + if (c->target_per_cpu[cpu] != qos_val[cpu]) + cpumask_set_cpu(cpu, cpus); + c->target_per_cpu[cpu] = qos_val[cpu]; + } +} + /** * pm_qos_update_target - manages the constraints list and calls the notifiers * if needed * @c: constraints data struct - * @node: request to add to the list, to update or to remove + * @req: request to add to the list, to update or to remove * @action: action to take on the constraints list * @value: value of the request to add or update * * This function returns 1 if the aggregated constraint value has changed, 0 * otherwise. 
*/ -int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, - enum pm_qos_req_action action, int value) +int pm_qos_update_target(struct pm_qos_constraints *c, + struct pm_qos_request *req, + enum pm_qos_req_action action, int value) { unsigned long flags; int prev_value, curr_value, new_value; + struct plist_node *node = &req->node; + struct cpumask cpus; int ret; spin_lock_irqsave(&pm_qos_lock, flags); @@ -305,7 +350,9 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, } curr_value = pm_qos_get_value(c); + cpumask_clear(&cpus); pm_qos_set_value(c, curr_value); + pm_qos_set_value_for_cpus(c, &cpus); spin_unlock_irqrestore(&pm_qos_lock, flags); @@ -315,7 +362,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, if (c->notifiers) blocking_notifier_call_chain(c->notifiers, (unsigned long)curr_value, - NULL); + &cpus); } else { ret = 0; } @@ -398,12 +445,50 @@ int pm_qos_request(int pm_qos_class) } EXPORT_SYMBOL_GPL(pm_qos_request); +int pm_qos_request_for_cpu(int pm_qos_class, int cpu) +{ + return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu]; +} +EXPORT_SYMBOL(pm_qos_request_for_cpu); + int pm_qos_request_active(struct pm_qos_request *req) { return req->pm_qos_class != 0; } EXPORT_SYMBOL_GPL(pm_qos_request_active); +int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask) +{ + unsigned long irqflags; + int cpu; + struct pm_qos_constraints *c = NULL; + int val; + + spin_lock_irqsave(&pm_qos_lock, irqflags); + c = pm_qos_array[pm_qos_class]->constraints; + val = c->default_value; + + for_each_cpu(cpu, mask) { + switch (c->type) { + case PM_QOS_MIN: + if (c->target_per_cpu[cpu] < val) + val = c->target_per_cpu[cpu]; + break; + case PM_QOS_MAX: + if (c->target_per_cpu[cpu] > val) + val = c->target_per_cpu[cpu]; + break; + default: + BUG(); + break; + } + } + spin_unlock_irqrestore(&pm_qos_lock, irqflags); + + return val; +} +EXPORT_SYMBOL(pm_qos_request_for_cpumask); + static void __pm_qos_update_request(struct pm_qos_request *req, s32 new_value) { @@ -412,7 +497,7 @@ static void __pm_qos_update_request(struct pm_qos_request *req, if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + req, PM_QOS_UPDATE_REQ, new_value); } /** @@ -430,6 +515,41 @@ static void pm_qos_work_fn(struct work_struct *work) __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); } +#ifdef CONFIG_SMP +static void pm_qos_irq_release(struct kref *ref) +{ + unsigned long flags; + struct irq_affinity_notify *notify = container_of(ref, + struct irq_affinity_notify, kref); + struct pm_qos_request *req = container_of(notify, + struct pm_qos_request, irq_notify); + struct pm_qos_constraints *c = + pm_qos_array[req->pm_qos_class]->constraints; + + spin_lock_irqsave(&pm_qos_lock, flags); + cpumask_setall(&req->cpus_affine); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, c->default_value); +} + +static void pm_qos_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + unsigned long flags; + struct pm_qos_request *req = container_of(notify, + struct pm_qos_request, irq_notify); + struct pm_qos_constraints *c = + pm_qos_array[req->pm_qos_class]->constraints; + + spin_lock_irqsave(&pm_qos_lock, flags); + cpumask_copy(&req->cpus_affine, mask); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, req->node.prio); +} 
+#endif + /** * pm_qos_add_request - inserts new qos request into the list * @req: pointer to a preallocated handle @@ -453,11 +573,56 @@ void pm_qos_add_request(struct pm_qos_request *req, WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); return; } + + switch (req->type) { + case PM_QOS_REQ_AFFINE_CORES: + if (cpumask_empty(&req->cpus_affine)) { + req->type = PM_QOS_REQ_ALL_CORES; + cpumask_setall(&req->cpus_affine); + WARN(1, KERN_ERR "Affine cores not set for request with affinity flag\n"); + } + break; +#ifdef CONFIG_SMP + case PM_QOS_REQ_AFFINE_IRQ: + if (irq_can_set_affinity(req->irq)) { + int ret = 0; + struct irq_desc *desc = irq_to_desc(req->irq); + struct cpumask *mask = desc->irq_data.common->affinity; + + /* Get the current affinity */ + cpumask_copy(&req->cpus_affine, mask); + req->irq_notify.irq = req->irq; + req->irq_notify.notify = pm_qos_irq_notify; + req->irq_notify.release = pm_qos_irq_release; + + ret = irq_set_affinity_notifier(req->irq, + &req->irq_notify); + if (ret) { + WARN(1, KERN_ERR "IRQ affinity notify set failed\n"); + req->type = PM_QOS_REQ_ALL_CORES; + cpumask_setall(&req->cpus_affine); + } + } else { + req->type = PM_QOS_REQ_ALL_CORES; + cpumask_setall(&req->cpus_affine); + WARN(1, KERN_ERR "IRQ-%d not set for request with affinity flag\n", + req->irq); + } + break; +#endif + default: + WARN(1, KERN_ERR "Unknown request type %d\n", req->type); + /* fall through */ + case PM_QOS_REQ_ALL_CORES: + cpumask_setall(&req->cpus_affine); + break; + } + req->pm_qos_class = pm_qos_class; INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); trace_pm_qos_add_request(pm_qos_class, value); pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, - &req->node, PM_QOS_ADD_REQ, value); + req, PM_QOS_ADD_REQ, value); } EXPORT_SYMBOL_GPL(pm_qos_add_request); @@ -511,7 +676,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + req, PM_QOS_UPDATE_REQ, new_value); schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); } @@ -531,15 +696,25 @@ void pm_qos_remove_request(struct pm_qos_request *req) /* silent return to keep pcm code cleaner */ if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + WARN(1, "pm_qos_remove_request() called for unknown object\n"); return; } cancel_delayed_work_sync(&req->work); +#ifdef CONFIG_SMP + if (req->type == PM_QOS_REQ_AFFINE_IRQ) { + int ret = 0; + /* Get the current affinity */ + ret = irq_set_affinity_notifier(req->irq, NULL); + if (ret) + WARN(1, "IRQ affinity notify set failed\n"); + } +#endif + trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_REMOVE_REQ, + req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133c13e2..024411816ccf 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -26,9 +26,11 @@ #include <linux/suspend.h> #include <linux/syscore_ops.h> #include <linux/ftrace.h> +#include <linux/rtc.h> #include <trace/events/power.h> #include <linux/compiler.h> #include <linux/moduleparam.h> +#include <linux/wakeup_reason.h> #include "power.h" @@ -312,7 +314,8 @@ void __weak arch_suspend_enable_irqs(void) */ static int suspend_enter(suspend_state_t state, 
bool *wakeup) { - int error; + char suspend_abort[MAX_SUSPEND_ABORT_LEN]; + int error, last_dev; error = platform_suspend_prepare(state); if (error) @@ -320,7 +323,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; printk(KERN_ERR "PM: late suspend of devices failed\n"); + log_suspend_abort_reason("%s device failed to power down", + suspend_stats.failed_devs[last_dev]); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +336,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + log_suspend_abort_reason("noirq suspend of %s device failed", + suspend_stats.failed_devs[last_dev]); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); @@ -353,8 +364,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } error = disable_nonboot_cpus(); - if (error || suspend_test(TEST_CPUS)) + if (error || suspend_test(TEST_CPUS)) { + log_suspend_abort_reason("Disabling non-boot cpus failed"); goto Enable_cpus; + } arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -370,6 +383,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) state, false); events_check_enabled = false; } else if (*wakeup) { + pm_get_active_wakeup_sources(suspend_abort, + MAX_SUSPEND_ABORT_LEN); + log_suspend_abort_reason(suspend_abort); error = -EBUSY; } syscore_resume(); @@ -417,6 +433,7 @@ int suspend_devices_and_enter(suspend_state_t state) error = dpm_suspend_start(PMSG_SUSPEND); if (error) { pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); + log_suspend_abort_reason("Some devices failed to suspend, or early wake event detected"); goto Recover_platform; } suspend_test_finish("suspend devices"); @@ -518,6 +535,18 @@ static int enter_state(suspend_state_t state) return error; } +static void pm_suspend_marker(char *annotation) +{ + struct timespec ts; + struct rtc_time tm; + + getnstimeofday(&ts); + rtc_time_to_tm(ts.tv_sec, &tm); + pr_info("PM: suspend %s %d-%02d-%02d %02d:%02d:%02d.%09lu UTC\n", + annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec); +} + /** * pm_suspend - Externally visible function for suspending the system. * @state: System sleep state to enter. @@ -532,6 +561,7 @@ int pm_suspend(suspend_state_t state) if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) return -EINVAL; + pm_suspend_marker("entry"); error = enter_state(state); if (error) { suspend_stats.fail++; @@ -539,6 +569,7 @@ int pm_suspend(suspend_state_t state) } else { suspend_stats.success++; } + pm_suspend_marker("exit"); return error; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000000000000..252611fad2fe --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,225 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. 
+ * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/wakeup_reason.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/notifier.h> +#include <linux/suspend.h> + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static bool suspend_abort; +static char abort_reason[MAX_SUSPEND_ABORT_LEN]; +static struct kobject *wakeup_reason; +static DEFINE_SPINLOCK(resume_reason_lock); + +static ktime_t last_monotime; /* monotonic time before last suspend */ +static ktime_t curr_monotime; /* monotonic time after last suspend */ +static ktime_t last_stime; /* monotonic boottime offset before last suspend */ +static ktime_t curr_stime; /* monotonic boottime offset after last suspend */ + +static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(&resume_reason_lock); + if (suspend_abort) { + buf_offset = sprintf(buf, "Abort: %s", abort_reason); + } else { + for (irq_no = 0; irq_no < irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no]); + if (desc && desc->action && desc->action->name) + buf_offset += sprintf(buf + buf_offset, "%d %s\n", + irq_list[irq_no], desc->action->name); + else + buf_offset += sprintf(buf + buf_offset, "%d\n", + irq_list[irq_no]); + } + } + spin_unlock(&resume_reason_lock); + return buf_offset; +} + +static ssize_t last_suspend_time_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct timespec sleep_time; + struct timespec total_time; + struct timespec suspend_resume_time; + + /* + * total_time is calculated from monotonic bootoffsets because + * unlike CLOCK_MONOTONIC it include the time spent in suspend state. + */ + total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime)); + + /* + * suspend_resume_time is calculated as monotonic (CLOCK_MONOTONIC) + * time interval before entering suspend and post suspend. + */ + suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime)); + + /* sleep_time = total_time - suspend_resume_time */ + sleep_time = timespec_sub(total_time, suspend_resume_time); + + /* Export suspend_resume_time and sleep_time in pair here. 
*/ + return sprintf(buf, "%lu.%09lu %lu.%09lu\n", + suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec, + sleep_time.tv_sec, sleep_time.tv_nsec); +} + +static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason); +static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time); + +static struct attribute *attrs[] = { + &resume_reason.attr, + &suspend_time.attr, + NULL, +}; +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +/* + * logs all the wake up reasons to the kernel + * stores the irqs to expose them to the userspace via sysfs + */ +void log_wakeup_reason(int irq) +{ + struct irq_desc *desc; + desc = irq_to_desc(irq); + if (desc && desc->action && desc->action->name) + printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq, + desc->action->name); + else + printk(KERN_INFO "Resume caused by IRQ %d\n", irq); + + spin_lock(&resume_reason_lock); + if (irqcount == MAX_WAKEUP_REASON_IRQS) { + spin_unlock(&resume_reason_lock); + printk(KERN_WARNING "Resume caused by more than %d IRQs\n", + MAX_WAKEUP_REASON_IRQS); + return; + } + + irq_list[irqcount++] = irq; + spin_unlock(&resume_reason_lock); +} + +int check_wakeup_reason(int irq) +{ + int irq_no; + int ret = false; + + spin_lock(&resume_reason_lock); + for (irq_no = 0; irq_no < irqcount; irq_no++) + if (irq_list[irq_no] == irq) { + ret = true; + break; + } + spin_unlock(&resume_reason_lock); + return ret; +} + +void log_suspend_abort_reason(const char *fmt, ...) +{ + va_list args; + + spin_lock(&resume_reason_lock); + + //Suspend abort reason has already been logged. + if (suspend_abort) { + spin_unlock(&resume_reason_lock); + return; + } + + suspend_abort = true; + va_start(args, fmt); + vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args); + va_end(args); + spin_unlock(&resume_reason_lock); +} + +/* Detects a suspend and clears all the previous wake up reasons*/ +static int wakeup_reason_pm_event(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_SUSPEND_PREPARE: + spin_lock(&resume_reason_lock); + irqcount = 0; + suspend_abort = false; + spin_unlock(&resume_reason_lock); + /* monotonic time since boot */ + last_monotime = ktime_get(); + /* monotonic time since boot including the time spent in suspend */ + last_stime = ktime_get_boottime(); + break; + case PM_POST_SUSPEND: + /* monotonic time since boot */ + curr_monotime = ktime_get(); + /* monotonic time since boot including the time spent in suspend */ + curr_stime = ktime_get_boottime(); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block wakeup_reason_pm_notifier_block = { + .notifier_call = wakeup_reason_pm_event, +}; + +/* Initializes the sysfs parameter + * registers the pm_event notifier + */ +int __init wakeup_reason_init(void) +{ + int retval; + + retval = register_pm_notifier(&wakeup_reason_pm_notifier_block); + if (retval) + printk(KERN_WARNING "[%s] failed to register PM notifier %d\n", + __func__, retval); + + wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj); + if (!wakeup_reason) { + printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n", + __func__); + return 1; + } + retval = sysfs_create_group(wakeup_reason, &attr_group); + if (retval) { + kobject_put(wakeup_reason); + printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n", + __func__, retval); + } + return 0; +} + +late_initcall(wakeup_reason_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 
c048e34b177f..7b884dc55bd0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,10 @@ #include "console_cmdline.h" #include "braille.h" +#ifdef CONFIG_EARLY_PRINTK_DIRECT +extern void printascii(char *); +#endif + int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ @@ -232,7 +236,11 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ -}; +} +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +__packed __aligned(4) +#endif +; /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken @@ -273,11 +281,7 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else #define LOG_ALIGN __alignof__(struct printk_log) -#endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; @@ -1754,6 +1758,10 @@ asmlinkage int vprintk_emit(int facility, int level, } } +#ifdef CONFIG_EARLY_PRINTK_DIRECT + printascii(text); +#endif + if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; @@ -2130,8 +2138,12 @@ static int console_cpu_notify(struct notifier_block *self, case CPU_DEAD: case CPU_DOWN_FAILED: case CPU_UP_CANCELED: + case CPU_DYING: +#ifdef CONFIG_CONSOLE_FLUSH_ON_HOTPLUG console_lock(); console_unlock(); +#endif + break; } return NOTIFY_OK; } diff --git a/kernel/resource.c b/kernel/resource.c index 249b1eb1e6e1..4c9835c09dcd 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -163,7 +163,7 @@ static const struct file_operations proc_iomem_operations = { static int __init ioresources_init(void) { proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); + proc_create("iomem", S_IRUSR, NULL, &proc_iomem_operations); return 0; } __initcall(ioresources_init); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..846c15156616 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index caf4041f5b0a..bc54e84675da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 70e5e09341f1..58303b3dc356 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,9 @@ #include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/cpufreq.h> +#include <linux/syscore_ops.h> +#include <linux/list_sort.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -82,6 +85,9 @@ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif +#ifdef CONFIG_MSM_APP_SETTINGS +#include <asm/app_api.h> +#endif #include "sched.h" #include "../workqueue_internal.h" @@ -90,6 +96,16 @@ #define CREATE_TRACE_POINTS #include 
<trace/events/sched.h> +const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", + "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", + "IRQ_UPDATE"}; + +const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", + "RQ_TO_RQ", "GROUP_TO_GROUP"}; + +ATOMIC_NOTIFIER_HEAD(migration_notifier_head); +ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); + DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -762,8 +778,202 @@ void sched_avg_update(struct rq *rq) } } +/* + * Note C-state for (idle) cpus. + * + * @cstate = cstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cpu + * @wakeup_latency = latency to wakeup from cstate + * + */ +void +sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency) +{ + struct rq *rq = cpu_rq(cpu); + + rq->cstate = cstate; /* C1, C2 etc */ + rq->wakeup_energy = wakeup_energy; + rq->wakeup_latency = wakeup_latency; +} + #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_HMP + +static ktime_t ktime_last; +static bool sched_ktime_suspended; + +static bool use_cycle_counter; +static struct cpu_cycle_counter_cb cpu_cycle_counter_cb; + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +static int __init sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} +late_initcall(sched_init_ops); + +static inline void clear_ed_task(struct task_struct *p, struct rq *rq) +{ + if (p == rq->ed_task) + rq->ed_task = NULL; +} + +static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) +{ + p->last_wake_ts = wallclock; +} + +static inline void set_task_last_switch_out(struct task_struct *p, + u64 wallclock) +{ + p->last_switch_out_ts = wallclock; +} + +/* + * Note D-state for (idle) cluster. 
+ * + * @dstate = dstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cluster + * @wakeup_latency = latency to wakeup from cluster + * + */ +void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate, + int wakeup_energy, int wakeup_latency) +{ + struct sched_cluster *cluster = + cpu_rq(cpumask_first(cluster_cpus))->cluster; + cluster->dstate = dstate; + cluster->dstate_wakeup_energy = wakeup_energy; + cluster->dstate_wakeup_latency = wakeup_latency; +} + +u32 __weak get_freq_max_load(int cpu, u32 freq) +{ + /* 100% by default */ + return 100; +} + +DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); +static DEFINE_SPINLOCK(freq_max_load_lock); + +int sched_update_freq_max_load(const cpumask_t *cpumask) +{ + int i, cpu, ret; + unsigned int freq; + struct cpu_pstate_pwr *costs; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct freq_max_load *max_load, *old_max_load; + struct freq_max_load_entry *entry; + u64 max_demand_capacity, max_demand; + unsigned long flags; + u32 hfreq; + int hpct; + + if (!per_cpu_info) + return 0; + + spin_lock_irqsave(&freq_max_load_lock, flags); + max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); + for_each_cpu(cpu, cpumask) { + if (!per_cpu_info[cpu].ptable) { + ret = -EINVAL; + goto fail; + } + + old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + + /* + * allocate len + 1 and leave the last power cost as 0 for + * power_cost() can stop iterating index when + * per_cpu_info[cpu].len > len of max_load due to race between + * cpu power stats update and get_cpu_pwr_stats(). + */ + max_load = kzalloc(sizeof(struct freq_max_load) + + sizeof(struct freq_max_load_entry) * + (per_cpu_info[cpu].len + 1), GFP_ATOMIC); + if (unlikely(!max_load)) { + ret = -ENOMEM; + goto fail; + } + + max_load->length = per_cpu_info[cpu].len; + + max_demand = max_demand_capacity * + cpu_max_possible_capacity(cpu); + + i = 0; + costs = per_cpu_info[cpu].ptable; + while (costs[i].freq) { + entry = &max_load->freqs[i]; + freq = costs[i].freq; + hpct = get_freq_max_load(cpu, freq); + if (hpct <= 0 && hpct > 100) + hpct = 100; + hfreq = div64_u64((u64)freq * hpct, 100); + entry->hdemand = + div64_u64(max_demand * hfreq, + cpu_max_possible_freq(cpu)); + i++; + } + + rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); + if (old_max_load) + kfree_rcu(old_max_load, rcu); + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return 0; + +fail: + for_each_cpu(cpu, cpumask) { + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (max_load) { + rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); + kfree_rcu(max_load, rcu); + } + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return ret; +} + +#else /* CONFIG_SCHED_HMP */ +u64 sched_ktime_clock(void) +{ + return 0; +} + +static inline void clear_ed_task(struct task_struct *p, struct rq *rq) {} +static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) {} +static inline void set_task_last_switch_out(struct task_struct *p, + u64 wallclock) {} +#endif /* CONFIG_SCHED_HMP */ + #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) /* @@ -833,6 +1043,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_RESTORE)) sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); + trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_allowed)[0]); } static inline void 
dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -841,6 +1052,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); + trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_allowed)[0]); } void activate_task(struct rq *rq, struct task_struct *p, int flags) @@ -856,6 +1068,9 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) if (task_contributes_to_load(p)) rq->nr_uninterruptible++; + if (flags & DEQUEUE_SLEEP) + clear_ed_task(p, rq); + dequeue_task(rq, p, flags); } @@ -1047,6 +1262,3004 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq, true); } +#ifdef CONFIG_SCHED_HMP +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_online_cpu(i) { + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + __update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. 
+ */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +struct list_head cluster_head; +static DEFINE_MUTEX(cluster_lock); +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +static struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .max_mitigated_freq = UINT_MAX, + .min_freq = 1, + .max_possible_freq = 1, + .dstate = 0, + .dstate_wakeup_energy = 0, + .dstate_wakeup_latency = 0, + .exec_scale_factor = 1024, +}; + +void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + + pre_big_task_count_change(cpu_possible_mask); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + sched_update_freq_max_load(cpu_possible_mask); + post_big_task_count_change(cpu_possible_mask); +} + +static void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +static void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. 
*/ + smp_mb(); + dst->next = first; +} + +static int +compare_clusters(void *priv, struct list_head *a, struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +static void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + } + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. + */ + move_list(&cluster_head, &new_head, false); +} + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + __WARN_printf("Cluster allocation failed. \ + Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->max_mitigated_freq = UINT_MAX; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->dstate = 0; + cluster->dstate_wakeup_energy = 0; + cluster->dstate_wakeup_latency = 0; + cluster->freq_init_done = false; + + cluster->cpus = *cpus; + cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +#ifdef CONFIG_SMP +static void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); +} +#endif + +static void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + INIT_LIST_HEAD(&cluster_head); +} + +int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) +{ + mutex_lock(&cluster_lock); + if (!cb->get_cpu_cycle_counter) { + mutex_unlock(&cluster_lock); + return -EINVAL; + } + + cpu_cycle_counter_cb = *cb; + use_cycle_counter = true; + mutex_unlock(&cluster_lock); + + return 0; +} + +static int __init set_sched_enable_hmp(char *str) +{ + int enable_hmp = 0; + + get_option(&str, &enable_hmp); + + sched_enable_hmp = !!enable_hmp; + + return 0; +} + +early_param("sched_enable_hmp", set_sched_enable_hmp); + +static inline int got_boost_kick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + return test_bit(BOOST_KICK, &rq->hmp_flags); +} + +static inline void clear_boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(BOOST_KICK, &rq->hmp_flags); +} + +void boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) + smp_send_reschedule(cpu); +} + +/* Clear any HMP scheduler related requests pending from or on cpu */ +static inline void clear_hmp_request(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + clear_boost_kick(cpu); + clear_reserved(cpu); + if (rq->push_task) { + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->push_task) { + clear_reserved(rq->push_cpu); + put_task_struct(rq->push_task); + rq->push_task = NULL; + } + rq->active_balance = 0; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + +int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) +{ + struct rq *rq = cpu_rq(cpu); + + rq->static_cpu_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cpu_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->static_cpu_pwr_cost; +} + +int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + + cluster->static_cluster_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cluster_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline int got_boost_kick(void) +{ + return 0; +} + +static inline void clear_boost_kick(int cpu) { } + +static inline void clear_hmp_request(int cpu) { } + +int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) +{ + return 0; +} + +#ifdef CONFIG_SMP +static void update_cluster_topology(void) { } +#endif + +#endif /* CONFIG_SCHED_HMP */ + +#define SCHED_MIN_FREQ 1 + +#if defined(CONFIG_SCHED_HMP) + +/* + * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy + * associated with them. This is required for atomic update of those variables + * when being modified via the sysctl interface. + * + * IMPORTANT: Initialize both copies to the same value!! + */ + +/* + * Tasks that are runnable continuously for a period greater than + * EARLY_DETECTION_DURATION can be flagged early as potential + * high load tasks.
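EARLY_DETECTION_DURATION below is in + * nanoseconds (9.5ms), slightly less than the default 10ms sched_ravg_window, + * presumably so that such a task can be flagged before its first full window + * completes.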
+ */ +#define EARLY_DETECTION_DURATION 9500000 + +static __read_mostly unsigned int sched_ravg_hist_size = 5; +__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; + +static __read_mostly unsigned int sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +__read_mostly unsigned int sysctl_sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; + +#define SCHED_ACCOUNT_WAIT_TIME 1 + +__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +unsigned int __read_mostly sysctl_sched_enable_colocation = 1; + +#ifdef CONFIG_SCHED_FREQ_INPUT + +__read_mostly unsigned int sysctl_sched_new_task_windows = 5; + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 + +/* + * For increase, send notification if + * freq_required - cur_freq > sysctl_sched_freq_inc_notify + */ +__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ + +/* + * For decrease, send notification if + * cur_freq - freq_required > sysctl_sched_freq_dec_notify + */ +__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ + +static __read_mostly unsigned int sched_io_is_busy; + +__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024; + +#endif /* CONFIG_SCHED_FREQ_INPUT */ + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly sched_use_pelt; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. + */ +unsigned int min_max_freq = 1; + +unsigned int max_capacity = 1024; /* max(rq->capacity) */ +unsigned int min_capacity = 1024; /* min(rq->capacity) */ +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +unsigned int +min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ + +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = 10000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +/* Temporarily disable window-stats activity on all cpus */ +unsigned int __read_mostly sched_disable_window_stats; + +/* + * Major task runtime. If a task runs for more than sched_major_task_runtime + * in a window, it's considered to be generating majority of workload + * for this window. Prediction could be adjusted for such tasks. + */ +#ifdef CONFIG_SCHED_FREQ_INPUT +__read_mostly unsigned int sched_major_task_runtime = 10000000; +#endif + +static unsigned int sync_cpu; + +static LIST_HEAD(related_thread_groups); +static DEFINE_RWLOCK(related_thread_group_lock); + +#define for_each_related_thread_group(grp) \ + list_for_each_entry(grp, &related_thread_groups, list) + +/* + * Demand aggregation for frequency purpose: + * + * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads + * for frequency determination purpose. This aggregation is done per-cluster. + * + * CPU demand of tasks from various related groups is aggregated per-cluster and + * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined + * by just rq->prev_runnable_sum. 
+ * + * Some examples follow, which assume: + * Cluster0 = CPU0-3, Cluster1 = CPU4-7 + * One related thread group A that has tasks A0, A1, A2 + * + * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of + * tasks belonging to group A are accumulated when they run on cpu X. + * + * CX->curr/prev_sum = counters in which cpu execution stats of all tasks + * not belonging to group A are accumulated when they run on cpu X + * + * Lets say the stats for window M was as below: + * + * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms + * Task A0 ran 5ms on CPU0 + * Task B0 ran 1ms on CPU0 + * + * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms + * Task A1 ran 4ms on CPU1 + * Task A2 ran 2ms on CPU1 + * Task B1 ran 5ms on CPU1 + * + * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 + * CPU2 idle + * + * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 + * CPU3 idle + * + * In this case, CPU1 was most busy going by just its prev_sum counter. Demand + * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy + * time reported to governor will be: + * + * + * C0 busy time = 1ms + * C1 busy time = 5 + 5 + 6 = 16ms + * + */ +static __read_mostly unsigned int sched_freq_aggregate; +__read_mostly unsigned int sysctl_sched_freq_aggregate; + +#define EXITING_TASK_MARKER 0xdeaddead + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static int __init set_sched_ravg_window(char *str) +{ + get_option(&str, &sched_ravg_window); + + sched_use_pelt = (sched_ravg_window < MIN_SCHED_RAVG_WINDOW || + sched_ravg_window > MAX_SCHED_RAVG_WINDOW); + + return 0; +} + +early_param("sched_ravg_window", set_sched_ravg_window); + +static inline void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < sched_ravg_window) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + u32 freq; + + freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); + delta *= rq->cluster->exec_scale_factor; + delta >>= 10; + + return delta; +} + +#ifdef CONFIG_SCHED_FREQ_INPUT + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +/* Does freq_required sufficiently exceed or fall behind cur_freq? */ +static inline int +nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) +{ + int delta = freq_required - cur_freq; + + if (freq_required > cur_freq) + return delta < sysctl_sched_freq_inc_notify; + + delta = -delta; + + return delta < sysctl_sched_freq_dec_notify; +} + +/* Convert busy time to frequency equivalent */ +static inline unsigned int load_to_freq(struct rq *rq, u64 load) +{ + unsigned int freq; + + load = scale_load_to_cpu(load, cpu_of(rq)); + load *= 128; + load = div64_u64(load, max_task_load()); + + freq = load * cpu_max_possible_freq(cpu_of(rq)); + freq /= 128; + + return freq; +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu); + +/* + * Return load from all related group in given cpu. + * Caller must ensure that related_thread_group_lock is held. 
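+ * The group counters read here are updated under the cpu's rq lock, so + * callers (e.g. send_notification() and sched_get_cpus_busy()) hold that + * rq lock as well.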
+ */ +static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) +{ + struct related_thread_group *grp; + + for_each_related_thread_group(grp) { + struct group_cpu_time *cpu_time; + + cpu_time = _group_cpu_time(grp, cpu); + *grp_load += cpu_time->prev_runnable_sum; + if (new_grp_load) + *new_grp_load += cpu_time->nt_prev_runnable_sum; + } +} + +/* + * Return load from all related groups in given frequency domain. + * Caller must ensure that related_thread_group_lock is held. + */ +static void group_load_in_freq_domain(struct cpumask *cpus, + u64 *grp_load, u64 *new_grp_load) +{ + struct related_thread_group *grp; + int j; + + for_each_related_thread_group(grp) { + for_each_cpu(j, cpus) { + struct group_cpu_time *cpu_time; + + cpu_time = _group_cpu_time(grp, j); + *grp_load += cpu_time->prev_runnable_sum; + *new_grp_load += cpu_time->nt_prev_runnable_sum; + } + } +} + +/* + * Should scheduler alert governor for changing frequency? + * + * @check_pred - evaluate frequency based on the predictive demand + * @check_groups - add load from all related groups on given cpu + * + * check_groups is set to 1 if a "related" task movement/wakeup is triggering + * the notification check. To avoid "re-aggregation" of demand in such cases, + * we check whether the migrated/woken tasks demand (along with demand from + * existing tasks on the cpu) can be met on target cpu + * + */ + +static int send_notification(struct rq *rq, int check_pred, int check_groups) +{ + unsigned int cur_freq, freq_required; + unsigned long flags; + int rc = 0; + u64 group_load = 0, new_load = 0; + + if (!sched_enable_hmp) + return 0; + + if (check_pred) { + u64 prev = rq->old_busy_time; + u64 predicted = rq->hmp_stats.pred_demands_sum; + + if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) + return 0; + + prev = max(prev, rq->old_estimated_time); + if (prev > predicted) + return 0; + + cur_freq = load_to_freq(rq, prev); + freq_required = load_to_freq(rq, predicted); + + if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) + return 0; + } else { + read_lock(&related_thread_group_lock); + /* + * Protect from concurrent update of rq->prev_runnable_sum and + * group cpu load + */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (check_groups) + _group_load_in_cpu(cpu_of(rq), &group_load, NULL); + + new_load = rq->prev_runnable_sum + group_load; + + raw_spin_unlock_irqrestore(&rq->lock, flags); + read_unlock(&related_thread_group_lock); + + cur_freq = load_to_freq(rq, rq->old_busy_time); + freq_required = load_to_freq(rq, new_load); + + if (nearly_same_freq(cur_freq, freq_required)) + return 0; + } + + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->notifier_sent) { + rq->notifier_sent = 1; + rc = 1; + trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, + new_load); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +/* Alert governor if there is a need to change frequency */ +void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) +{ + int cpu = cpu_of(rq); + + if (!send_notification(rq, check_pred, check_groups)) + return; + + atomic_notifier_call_chain( + &load_alert_notifier_head, 0, + (void *)(long)cpu); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! 
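The idle task is never woken + * or migrated, so only PICK_NEXT_TASK, PUT_PREV_TASK, TASK_UPDATE and + * IRQ_UPDATE can reach here for it.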
*/ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on a sleeping task, when it is moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +static inline bool is_new_task(struct task_struct *p) +{ + return p->ravg.active_windows < sysctl_sched_new_task_windows; +} + +#define INC_STEP 8 +#define DEC_STEP 2 +#define CONSISTENT_THRES 16 +#define INC_STEP_BIG 16 +/* + * bucket_increase - update the count of all buckets + * + * @buckets: array of buckets tracking busy time of a task + * @idx: the index of bucket to be incremented + * + * Each time a complete window finishes, the count of the bucket that the + * runtime falls in (@idx) is incremented. Counts of all other buckets are + * decayed. The rate of increase and decay could be different based + * on the current count in the bucket. + */ +static inline void bucket_increase(u8 *buckets, int idx) +{ + int i, step; + + for (i = 0; i < NUM_BUSY_BUCKETS; i++) { + if (idx != i) { + if (buckets[i] > DEC_STEP) + buckets[i] -= DEC_STEP; + else + buckets[i] = 0; + } else { + step = buckets[i] >= CONSISTENT_THRES ? + INC_STEP_BIG : INC_STEP; + if (buckets[i] > U8_MAX - step) + buckets[i] = U8_MAX; + else + buckets[i] += step; + } + } +} + +static inline int busy_to_bucket(u32 normalized_rt) +{ + int bidx; + + bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); + bidx = min(bidx, NUM_BUSY_BUCKETS - 1); + + /* + * Combine the lowest two buckets. Runtime at the lowest frequency still + * falls into the 2nd bucket, so continuing to predict the lowest bucket + * is not useful. + */ + if (!bidx) + bidx++; + + return bidx; +} + +static inline u64 +scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) +{ + return div64_u64(load * (u64)src_freq, (u64)dst_freq); +} + +#define HEAVY_TASK_SKIP 2 +#define HEAVY_TASK_SKIP_LIMIT 4 +/* + * get_pred_busy - calculate predicted demand for a task on runqueue + * + * @rq: runqueue of task p + * @p: task whose prediction is being updated + * @start: starting bucket. Returned prediction should not be lower than + * this bucket. + * @runtime: runtime of the task. Returned prediction should not be lower + * than this runtime. + * Note: @start can be derived from @runtime. It's passed in only to + * avoid duplicated calculation in some cases. + * + * A new predicted busy time is returned for task @p based on @runtime + * passed in. The function searches through buckets that represent busy + * time equal to or greater than @runtime and attempts to find the bucket + * to use for prediction. Once found, it searches through historical busy + * time and returns the latest that falls into the bucket. If no such busy + * time exists, it returns the middle of that bucket.
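For example (with illustrative + * values NUM_BUSY_BUCKETS == 10 and max_task_load() == 10ms): a @runtime of + * 3.2ms starts the search at bucket 3; if buckets 3 and 4 are empty but + * bucket 5 is not, the prediction comes from bucket 5, i.e. the most recent + * history sample in [5ms, 6ms), or 5.5ms when no such sample exists.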
+ */ +static u32 get_pred_busy(struct rq *rq, struct task_struct *p, + int start, u32 runtime) +{ + int i; + u8 *buckets = p->ravg.busy_buckets; + u32 *hist = p->ravg.sum_history; + u32 dmin, dmax; + u64 cur_freq_runtime = 0; + int first = NUM_BUSY_BUCKETS, final, skip_to; + u32 ret = runtime; + + /* skip prediction for new tasks due to lack of history */ + if (unlikely(is_new_task(p))) + goto out; + + /* find minimal bucket index to pick */ + for (i = start; i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i]) { + first = i; + break; + } + } + /* if no higher buckets are filled, predict runtime */ + if (first >= NUM_BUSY_BUCKETS) + goto out; + + /* compute the bucket for prediction */ + final = first; + if (first < HEAVY_TASK_SKIP_LIMIT) { + /* compute runtime at current CPU frequency */ + cur_freq_runtime = mult_frac(runtime, max_possible_efficiency, + rq->cluster->efficiency); + cur_freq_runtime = scale_load_to_freq(cur_freq_runtime, + max_possible_freq, rq->cluster->cur_freq); + /* + * if the task runs for majority of the window, try to + * pick higher buckets. + */ + if (cur_freq_runtime >= sched_major_task_runtime) { + int next = NUM_BUSY_BUCKETS; + /* + * if there is a higher bucket that's consistently + * hit, don't jump beyond that. + */ + for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT && + i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i] > CONSISTENT_THRES) { + next = i; + break; + } + } + skip_to = min(next, start + HEAVY_TASK_SKIP); + /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */ + skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to); + /* don't go below first non-empty bucket, if any */ + final = max(first, skip_to); + } + } + + /* determine demand range for the predicted bucket */ + if (final < 2) { + /* lowest two buckets are combined */ + dmin = 0; + final = 1; + } else { + dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); + } + dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); + + /* + * search through runtime history and return first runtime that falls + * into the range of predicted bucket. + */ + for (i = 0; i < sched_ravg_hist_size; i++) { + if (hist[i] >= dmin && hist[i] < dmax) { + ret = hist[i]; + break; + } + } + /* no historical runtime within bucket found, use average of the bin */ + if (ret < dmin) + ret = (dmin + dmax) / 2; + /* + * when updating in middle of a window, runtime could be higher + * than all recorded history. Always predict at least runtime. + */ + ret = max(runtime, ret); +out: + trace_sched_update_pred_demand(rq, p, runtime, + mult_frac((unsigned int)cur_freq_runtime, 100, + sched_ravg_window), ret); + return ret; +} + +static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) +{ + if (p->ravg.pred_demand >= p->ravg.curr_window) + return p->ravg.pred_demand; + + return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), + p->ravg.curr_window); +} + +/* + * predictive demand of a task is calculated at the window roll-over. + * if the task current window busy time exceeds the predicted + * demand, update it here to reflect the task needs. 
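+ * (pred_demand is only ever raised here; it is recomputed, and may drop, + * at the next window rollover via update_history())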
+ */ +void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) +{ + u32 new, old; + + if (is_idle_task(p) || exiting_task(p)) + return; + + if (event != PUT_PREV_TASK && event != TASK_UPDATE && + (!SCHED_FREQ_ACCOUNT_WAIT_TIME || + (event != TASK_MIGRATE && + event != PICK_NEXT_TASK))) + return; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) + return; + } + + new = calc_pred_demand(rq, p); + old = p->ravg.pred_demand; + + if (old >= new) + return; + + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, + p->ravg.demand, + new); + + p->ravg.pred_demand = new; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + int flip_counters = 0; + int prev_sum_reset = 0; + bool new_task; + struct related_thread_group *grp; + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + grp = p->grp; + if (grp && sched_freq_aggregate) { + /* cpu_time protected by rq_lock */ + struct group_cpu_time *cpu_time = + _group_cpu_time(grp, cpu_of(rq)); + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (cpu_time->window_start != rq->window_start) { + int nr_windows; + + delta = rq->window_start - cpu_time->window_start; + nr_windows = div64_u64(delta, window_size); + if (nr_windows > 1) + prev_sum_reset = 1; + + cpu_time->window_start = rq->window_start; + flip_counters = 1; + } + + if (p_is_curr_task && new_window) { + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + + if (full_window) + curr_sum = nt_curr_sum = 0; + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; + } + } else { + if (p_is_curr_task && new_window) { + flip_counters = 1; + if (full_window) + prev_sum_reset = 1; + } + } + + /* Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. 
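For all other tasks the + * just-completed curr_window value becomes prev_window (zeroed first when + * more than one full window has elapsed) and curr_window then restarts + * from zero.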
*/ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!full_window) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (flip_counters) { + u64 curr_sum = *curr_runnable_sum; + u64 nt_curr_sum = *nt_curr_runnable_sum; + + if (prev_sum_reset) + curr_sum = nt_curr_sum = 0; + + *prev_runnable_sum = curr_sum; + *nt_prev_runnable_sum = nt_curr_sum; + + *curr_runnable_sum = 0; + *nt_curr_runnable_sum = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO */ + + if (!new_window) + return; + + /* A new window has started. The RQ demand must be rolled + * over if p is the current task. */ + if (p_is_curr_task) { + /* p is idle task */ + BUG_ON(p != rq->idle); + } + + return; + } + + if (!new_window) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. */ + + if (!full_window) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. 
+ * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. */ + + if (!full_window) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + } + + /* Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static inline u32 predict_and_update_buckets(struct rq *rq, + struct task_struct *p, u32 runtime) { + + int bidx; + u32 pred_demand; + + bidx = busy_to_bucket(runtime); + pred_demand = get_pred_busy(rq, p, bidx, runtime); + bucket_increase(p->ravg.busy_buckets, bidx); + + return pred_demand; +} +#define assign_ravg_pred_demand(x) (p->ravg.pred_demand = x) + +#else /* CONFIG_SCHED_FREQ_INPUT */ + +static inline void +update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) +{ +} + +static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ +} + +static inline u32 predict_and_update_buckets(struct rq *rq, + struct task_struct *p, u32 runtime) +{ + return 0; +} +#define assign_ravg_pred_demand(x) + +#endif /* CONFIG_SCHED_FREQ_INPUT */ + +static void update_task_cpu_cycles(struct task_struct *p, int cpu) +{ + if (use_cycle_counter) + p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); +} + +static void +update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 cur_cycles; + int cpu = cpu_of(rq); + + lockdep_assert_held(&rq->lock); + + if (!use_cycle_counter) { + rq->cc.cycles = cpu_cur_freq(cpu); + rq->cc.time = 1; + return; + } + + cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); + + /* + * If current task is idle task and irqtime == 0 CPU was + * indeed idle and probably its cycle counter was not + * increasing. We still need estimatied CPU frequency + * for IO wait time accounting. Use the previously + * calculated frequency in such a case. + */ + if (!is_idle_task(rq->curr) || irqtime) { + if (unlikely(cur_cycles < p->cpu_cycles)) + rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles); + else + rq->cc.cycles = cur_cycles - p->cpu_cycles; + rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC; + + if (event == IRQ_UPDATE && is_idle_task(p)) + /* + * Time between mark_start of idle task and IRQ handler + * entry time is CPU cycle counter stall period. + * Upon IRQ handler entry sched_account_irqstart() + * replenishes idle task's cpu cycle counter so + * rq->cc.cycles now represents increased cycles during + * IRQ handler rather than time between idle entry and + * IRQ exit. Thus use irqtime as time delta. + */ + rq->cc.time = irqtime; + else + rq->cc.time = wallclock - p->ravg.mark_start; + BUG_ON((s64)rq->cc.time < 0); + } + + p->cpu_cycles = cur_cycles; + + trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* No need to bother updating task demand for exiting tasks + * or the idle task. */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. 
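+ * + * For example (illustrative): if a task's last event was in window W0 and the + * next event arrives in window W4, update_task_demand() first records W0 as a + * single sample and then calls this with 'samples' == 3 full windows for + * W1-W3; the time inside W4 stays in p->ravg.sum for the still-open window.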
+ */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand, pred_demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + pred_demand = predict_and_update_buckets(rq, p, runtime); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, demand, + pred_demand); + + p->ravg.demand = demand; + assign_ravg_pred_demand(pred_demand); + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! 
+ */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* The simple case - busy time contained within the existing + * window. */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* Roll window_start back to current to process any remainder + * in current window. */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +static void +update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + if (sched_use_pelt || !rq->window_start || sched_disable_window_stats) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) { + update_task_cpu_cycles(p, cpu_of(rq)); + goto done; + } + + update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + update_task_pred_demand(rq, p, event); +done: + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, + rq->cc.cycles, rq->cc.time, + _group_cpu_time(p->grp, cpu_of(rq))); + + p->ravg.mark_start = wallclock; +} + +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. 
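e.g. (illustrative) with + * nr_windows == 1, an avg_irqload of 8ms decays to 8ms * 3 / 4 = 6ms before + * the last window's cur_irqload is added below.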
*/ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + + if (!rq->window_start || sched_disable_window_stats) + return; + + if (is_idle_task(curr)) { + /* We're here without rq->lock held, IRQ disabled */ + raw_spin_lock(&rq->lock); + update_task_cpu_cycles(curr, cpu); + raw_spin_unlock(&rq->lock); + } +} + +static void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +static inline void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = p->last_wake_ts = wallclock; + p->last_cpu_selected_ts = wallclock; + p->last_switch_out_ts = 0; + update_task_cpu_cycles(p, cpu_of(rq)); +} + +static inline void set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start || !sched_enable_hmp) + return; + + if (cpu == sync_cpu) { + rq->window_start = sched_ktime_clock(); + } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; +#ifdef CONFIG_SCHED_FREQ_INPUT + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; +#endif + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +static inline void migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +static void reset_all_task_stats(void) +{ + struct task_struct *g, *p; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + reset_task_stats(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +} + +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. 
+ */ +void sched_exit(struct task_struct *p) +{ + unsigned long flags; + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + u64 wallclock; + + sched_set_group_id(p, 0); + + raw_spin_lock_irqsave(&rq->lock, flags); + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + enqueue_task(rq, p, 0); + clear_ed_task(p, rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + put_cpu(); +} + +static void disable_window_stats(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + sched_disable_window_stats = 1; + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + + local_irq_restore(flags); +} + +/* Called with all cpu's rq->lock held */ +static void enable_window_stats(void) +{ + sched_disable_window_stats = 0; + +} + +enum reset_reason_code { + WINDOW_CHANGE, + POLICY_CHANGE, + HIST_SIZE_CHANGE, + FREQ_AGGREGATE_CHANGE, +}; + +const char *sched_window_reset_reasons[] = { + "WINDOW_CHANGE", + "POLICY_CHANGE", + "HIST_SIZE_CHANGE", +}; + +/* Called with IRQs enabled */ +void reset_all_window_stats(u64 window_start, unsigned int window_size) +{ + int cpu; + unsigned long flags; + u64 start_ts = sched_ktime_clock(); + int reason = WINDOW_CHANGE; + unsigned int old = 0, new = 0; + struct related_thread_group *grp; + + disable_window_stats(); + + reset_all_task_stats(); + + local_irq_save(flags); + + read_lock(&related_thread_group_lock); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + raw_spin_lock(&rq->lock); + } + + list_for_each_entry(grp, &related_thread_groups, list) { + int j; + + for_each_possible_cpu(j) { + struct group_cpu_time *cpu_time; + /* Protected by rq lock */ + cpu_time = _group_cpu_time(grp, j); + memset(cpu_time, 0, sizeof(struct group_cpu_time)); + if (window_start) + cpu_time->window_start = window_start; + } + } + + if (window_size) { + sched_ravg_window = window_size * TICK_NSEC; + set_hmp_defaults(); + } + + enable_window_stats(); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + if (window_start) + rq->window_start = window_start; +#ifdef CONFIG_SCHED_FREQ_INPUT + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; +#endif + reset_cpu_hmp_stats(cpu, 1); + } + + if (sched_window_stats_policy != sysctl_sched_window_stats_policy) { + reason = POLICY_CHANGE; + old = sched_window_stats_policy; + new = sysctl_sched_window_stats_policy; + sched_window_stats_policy = sysctl_sched_window_stats_policy; + } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) { + reason = HIST_SIZE_CHANGE; + old = sched_ravg_hist_size; + new = sysctl_sched_ravg_hist_size; + sched_ravg_hist_size = sysctl_sched_ravg_hist_size; + } +#ifdef CONFIG_SCHED_FREQ_INPUT + else if (sched_freq_aggregate != + sysctl_sched_freq_aggregate) { + reason = FREQ_AGGREGATE_CHANGE; + old = sched_freq_aggregate; + new = sysctl_sched_freq_aggregate; + sched_freq_aggregate = sysctl_sched_freq_aggregate; + } +#endif + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + raw_spin_unlock(&rq->lock); + } + + read_unlock(&related_thread_group_lock); + + local_irq_restore(flags); + + trace_sched_reset_all_window_stats(window_start, window_size, + sched_ktime_clock() - start_ts, reason, old, new); +} + +#ifdef CONFIG_SCHED_FREQ_INPUT + 
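+/* + * sync_window_start() is defined further below; it rolls a related thread + * group's per-cpu window counters forward to the rq's current window. It is + * declared here because sched_get_cpus_busy() uses it before its definition. + */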
+static inline void +sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); + +void sched_get_cpus_busy(struct sched_load *busy, + const struct cpumask *query_cpus) +{ + unsigned long flags; + struct rq *rq; + const int cpus = cpumask_weight(query_cpus); + u64 load[cpus], group_load[cpus]; + u64 nload[cpus], ngload[cpus]; + u64 pload[cpus]; + unsigned int cur_freq[cpus], max_freq[cpus]; + int notifier_sent[cpus]; + int early_detection[cpus]; + int cpu, i = 0; + unsigned int window_size; + u64 max_prev_sum = 0; + int max_busy_cpu = cpumask_first(query_cpus); + struct related_thread_group *grp; + + if (unlikely(cpus == 0)) + return; + + /* + * This function could be called in timer context, and the + * current task may have been executing for a long time. Ensure + * that the window stats are current by doing an update. + */ + read_lock(&related_thread_group_lock); + + local_irq_save(flags); + for_each_cpu(cpu, query_cpus) + raw_spin_lock(&cpu_rq(cpu)->lock); + + window_size = sched_ravg_window; + + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), + 0); + cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + + load[i] = rq->old_busy_time = rq->prev_runnable_sum; + nload[i] = rq->nt_prev_runnable_sum; + pload[i] = rq->hmp_stats.pred_demands_sum; + rq->old_estimated_time = pload[i]; + + if (load[i] > max_prev_sum) { + max_prev_sum = load[i]; + max_busy_cpu = cpu; + } + + notifier_sent[i] = rq->notifier_sent; + early_detection[i] = (rq->ed_task != NULL); + rq->notifier_sent = 0; + cur_freq[i] = cpu_cur_freq(cpu); + max_freq[i] = cpu_max_freq(cpu); + i++; + } + + for_each_related_thread_group(grp) { + for_each_cpu(cpu, query_cpus) { + /* Protected by rq_lock */ + struct group_cpu_time *cpu_time = + _group_cpu_time(grp, cpu); + sync_window_start(cpu_rq(cpu), cpu_time); + } + } + + i = 0; + for_each_cpu(cpu, query_cpus) { + group_load[i] = 0; + ngload[i] = 0; + + if (early_detection[i]) + goto skip_early; + + rq = cpu_rq(cpu); + if (!notifier_sent[i]) { + if (cpu == max_busy_cpu) + group_load_in_freq_domain( + &rq->freq_domain_cpumask, + &group_load[i], &ngload[i]); + } else { + _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); + } + + load[i] += group_load[i]; + nload[i] += ngload[i]; + /* + * Scale load in reference to cluster max_possible_freq. + * + * Note that scale_load_to_cpu() scales load in reference to + * the cluster max_freq. 
+ */ + load[i] = scale_load_to_cpu(load[i], cpu); + nload[i] = scale_load_to_cpu(nload[i], cpu); + pload[i] = scale_load_to_cpu(pload[i], cpu); +skip_early: + i++; + } + + for_each_cpu(cpu, query_cpus) + raw_spin_unlock(&(cpu_rq(cpu))->lock); + local_irq_restore(flags); + + read_unlock(&related_thread_group_lock); + + i = 0; + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + if (early_detection[i]) { + busy[i].prev_load = div64_u64(sched_ravg_window, + NSEC_PER_USEC); + busy[i].new_task_load = 0; + goto exit_early; + } + + if (!notifier_sent[i]) { + load[i] = scale_load_to_freq(load[i], max_freq[i], + cur_freq[i]); + nload[i] = scale_load_to_freq(nload[i], max_freq[i], + cur_freq[i]); + if (load[i] > window_size) + load[i] = window_size; + if (nload[i] > window_size) + nload[i] = window_size; + + load[i] = scale_load_to_freq(load[i], cur_freq[i], + cpu_max_possible_freq(cpu)); + nload[i] = scale_load_to_freq(nload[i], cur_freq[i], + cpu_max_possible_freq(cpu)); + } else { + load[i] = scale_load_to_freq(load[i], max_freq[i], + cpu_max_possible_freq(cpu)); + nload[i] = scale_load_to_freq(nload[i], max_freq[i], + cpu_max_possible_freq(cpu)); + } + pload[i] = scale_load_to_freq(pload[i], max_freq[i], + rq->cluster->max_possible_freq); + + busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); + busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); + busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); + +exit_early: + trace_sched_get_busy(cpu, busy[i].prev_load, + busy[i].new_task_load, + busy[i].predicted_load, + early_detection[i]); + i++; + } +} + +void sched_set_io_is_busy(int val) +{ + sched_io_is_busy = val; +} + +int sched_set_window(u64 window_start, unsigned int window_size) +{ + u64 now, cur_jiffies, jiffy_ktime_ns; + s64 ws; + unsigned long flags; + + if (sched_use_pelt || + (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW)) + return -EINVAL; + + mutex_lock(&policy_mutex); + + /* + * Get a consistent view of ktime, jiffies, and the time + * since the last jiffy (based on last_jiffies_update). 
+ */ + local_irq_save(flags); + cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); + local_irq_restore(flags); + + /* translate window_start from jiffies to nanoseconds */ + ws = (window_start - cur_jiffies); /* jiffy difference */ + ws *= TICK_NSEC; + ws += jiffy_ktime_ns; + + /* roll back calculated window start so that it is in + * the past (window stats must have a current window) */ + while (ws > now) + ws -= (window_size * TICK_NSEC); + + BUG_ON(sched_ktime_clock() < ws); + + reset_all_window_stats(ws, window_size); + + sched_update_freq_max_load(cpu_possible_mask); + + mutex_unlock(&policy_mutex); + + return 0; +} + +static void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + struct migration_sum_data d; + bool new_task; + struct related_thread_group *grp; + + if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING)) + return; + + if (exiting_task(p)) { + clear_ed_task(p, src_rq); + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + update_task_cpu_cycles(p, new_cpu); + + new_task = is_new_task(p); + /* Protected by rq_lock */ + grp = p->grp; + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time; + + migrate_type = GROUP_TO_GROUP; + /* Protected by rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); + d.src_rq = NULL; + d.src_cpu_time = cpu_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + /* Protected by rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); + d.dst_rq = NULL; + d.dst_cpu_time = cpu_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + sync_window_start(dest_rq, cpu_time); + } else { + migrate_type = RQ_TO_RQ; + d.src_rq = src_rq; + d.src_cpu_time = NULL; + d.dst_rq = dest_rq; + d.dst_cpu_time = NULL; + src_curr_runnable_sum = &src_rq->curr_runnable_sum; + src_prev_runnable_sum = &src_rq->prev_runnable_sum; + src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum; + + dst_curr_runnable_sum = &dest_rq->curr_runnable_sum; + dst_prev_runnable_sum = &dest_rq->prev_runnable_sum; + dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum; + } + + if (p->ravg.curr_window) { + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + } + } + + if (p->ravg.prev_window) { + 
*src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + } + + if (p == src_rq->ed_task) { + src_rq->ed_task = NULL; + if (!dest_rq->ed_task) + dest_rq->ed_task = p; + } + + trace_sched_migration_update_sum(p, migrate_type, &d); + BUG_ON((s64)*src_prev_runnable_sum < 0); + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_nt_prev_runnable_sum < 0); + BUG_ON((s64)*src_nt_curr_runnable_sum < 0); + +done: + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +#else + +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } + +#endif /* CONFIG_SCHED_FREQ_INPUT */ + +#define sched_up_down_migrate_auto_update 1 +static void check_for_up_down_migrate_update(const struct cpumask *cpus) +{ + int i = cpumask_first(cpus); + + if (!sched_up_down_migrate_auto_update) + return; + + if (cpu_max_possible_capacity(i) == max_possible_capacity) + return; + + if (cpu_max_possible_freq(i) == cpu_max_freq(i)) + up_down_migrate_scale_factor = 1024; + else + up_down_migrate_scale_factor = (1024 * + cpu_max_possible_freq(i)) / cpu_max_freq(i); + + update_up_down_migrate(); +} + +/* Return cluster which can offer required capacity for group */ +static struct sched_cluster * +best_cluster(struct related_thread_group *grp, u64 total_demand) +{ + struct sched_cluster *cluster = NULL; + + for_each_sched_cluster(cluster) { + if (group_will_fit(cluster, grp, total_demand)) + return cluster; + } + + return NULL; +} + +static void _set_preferred_cluster(struct related_thread_group *grp) +{ + struct task_struct *p; + u64 combined_demand = 0; + + if (!sysctl_sched_enable_colocation) { + grp->last_update = sched_ktime_clock(); + grp->preferred_cluster = NULL; + return; + } + + /* + * wakeup of two or more related tasks could race with each other and + * could result in multiple calls to _set_preferred_cluster being issued + * at same time. Avoid overhead in such cases of rechecking preferred + * cluster + */ + if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) + return; + + list_for_each_entry(p, &grp->tasks, grp_list) + combined_demand += p->ravg.demand; + + grp->preferred_cluster = best_cluster(grp, combined_demand); + grp->last_update = sched_ktime_clock(); + trace_sched_set_preferred_cluster(grp, combined_demand); +} + +static void set_preferred_cluster(struct related_thread_group *grp) +{ + raw_spin_lock(&grp->lock); + _set_preferred_cluster(grp); + raw_spin_unlock(&grp->lock); +} + +#define ADD_TASK 0 +#define REM_TASK 1 + +#ifdef CONFIG_SCHED_FREQ_INPUT + +static void +update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime); + +static inline void free_group_cputime(struct related_thread_group *grp) +{ + free_percpu(grp->cpu_time); +} + +static int alloc_group_cputime(struct related_thread_group *grp) +{ + int i; + struct group_cpu_time *cpu_time; + int cpu = raw_smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + u64 window_start = rq->window_start; + + grp->cpu_time = alloc_percpu(struct group_cpu_time); + if (!grp->cpu_time) + return -ENOMEM; + + for_each_possible_cpu(i) { + cpu_time = per_cpu_ptr(grp->cpu_time, i); + memset(cpu_time, 0, sizeof(struct group_cpu_time)); + cpu_time->window_start = window_start; + } + + return 0; +} + +/* + * A group's window_start may be behind. When moving it forward, flip prev/curr + * counters. 
When moving forward > 1 window, prev counter is set to 0 + */ +static inline void +sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) +{ + u64 delta; + int nr_windows; + u64 curr_sum = cpu_time->curr_runnable_sum; + u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; + + delta = rq->window_start - cpu_time->window_start; + if (!delta) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + if (nr_windows > 1) + curr_sum = nt_curr_sum = 0; + + cpu_time->prev_runnable_sum = curr_sum; + cpu_time->curr_runnable_sum = 0; + + cpu_time->nt_prev_runnable_sum = nt_curr_sum; + cpu_time->nt_curr_runnable_sum = 0; + + cpu_time->window_start = rq->window_start; +} + +/* + * Task's cpu usage is accounted in: + * rq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + struct migration_sum_data d; + int migrate_type; + + if (!sched_freq_aggregate) + return; + + wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + + /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(rq)); + if (event == ADD_TASK) { + sync_window_start(rq, cpu_time); + migrate_type = RQ_TO_GROUP; + d.src_rq = rq; + d.src_cpu_time = NULL; + d.dst_rq = NULL; + d.dst_cpu_time = cpu_time; + src_curr_runnable_sum = &rq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &rq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } else if (event == REM_TASK) { + migrate_type = GROUP_TO_RQ; + d.src_rq = NULL; + d.src_cpu_time = cpu_time; + d.dst_rq = rq; + d.dst_cpu_time = NULL; + + /* + * In case of REM_TASK, cpu_time->window_start would be + * uptodate, because of the update_task_ravg() we called + * above on the moving task. 
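Editor's note: sync_window_start() above catches a group's per-CPU counters up with the runqueue's window_start. One elapsed window turns the current sums into the previous sums; a gap of more than one window discards both. A rough standalone model of that rollover, with made-up names and an assumed 20ms window:

	#include <stdint.h>

	#define WINDOW_NS 20000000ULL	/* assumed stats window, for illustration */

	struct group_windows {
		uint64_t window_start;
		uint64_t curr_sum, prev_sum;
	};

	/* Roll the group's counters forward to @rq_window_start. */
	static void sync_window(struct group_windows *g, uint64_t rq_window_start)
	{
		uint64_t delta = rq_window_start - g->window_start;

		if (!delta)
			return;

		if (delta / WINDOW_NS > 1) {
			/* More than one full window elapsed: nothing recent to keep. */
			g->prev_sum = 0;
		} else {
			/* One window boundary crossed: curr becomes prev. */
			g->prev_sum = g->curr_sum;
		}
		g->curr_sum = 0;
		g->window_start = rq_window_start;
	}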
Hence no need for + * sync_window_start() + */ + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &rq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &rq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + } + + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + + if (is_new_task(p)) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + + trace_sched_migration_update_sum(p, migrate_type, &d); + + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_prev_runnable_sum < 0); +} + +static inline struct group_cpu_time * +task_group_cpu_time(struct task_struct *p, int cpu) +{ + return _group_cpu_time(rcu_dereference(p->grp), cpu); +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu) +{ + return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL; +} + +#else /* CONFIG_SCHED_FREQ_INPUT */ + +static inline void free_group_cputime(struct related_thread_group *grp) { } + +static inline int alloc_group_cputime(struct related_thread_group *grp) +{ + return 0; +} + +static inline void transfer_busy_time(struct rq *rq, + struct related_thread_group *grp, struct task_struct *p, int event) +{ +} + +static struct group_cpu_time * +task_group_cpu_time(struct task_struct *p, int cpu) +{ + return NULL; +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu) +{ + return NULL; +} + +#endif + +struct related_thread_group *alloc_related_thread_group(int group_id) +{ + struct related_thread_group *grp; + + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) + return ERR_PTR(-ENOMEM); + + if (alloc_group_cputime(grp)) { + kfree(grp); + return ERR_PTR(-ENOMEM); + } + + grp->id = group_id; + INIT_LIST_HEAD(&grp->tasks); + INIT_LIST_HEAD(&grp->list); + raw_spin_lock_init(&grp->lock); + + return grp; +} + +struct related_thread_group *lookup_related_thread_group(unsigned int group_id) +{ + struct related_thread_group *grp; + + list_for_each_entry(grp, &related_thread_groups, list) { + if (grp->id == group_id) + return grp; + } + + return NULL; +} + +/* See comments before preferred_cluster() */ +static void free_related_thread_group(struct rcu_head *rcu) +{ + struct related_thread_group *grp = container_of(rcu, struct + related_thread_group, rcu); + + free_group_cputime(grp); + kfree(grp); +} + +static void remove_task_from_group(struct task_struct *p) +{ + struct related_thread_group *grp = p->grp; + struct rq *rq; + int empty_group = 1; + + raw_spin_lock(&grp->lock); + + rq = __task_rq_lock(p); + transfer_busy_time(rq, p->grp, p, REM_TASK); + list_del_init(&p->grp_list); + rcu_assign_pointer(p->grp, NULL); + __task_rq_unlock(rq); + + if (!list_empty(&grp->tasks)) { + empty_group = 0; + _set_preferred_cluster(grp); + } + + raw_spin_unlock(&grp->lock); + + if (empty_group) { + list_del(&grp->list); + call_rcu(&grp->rcu, free_related_thread_group); + } +} + +static int +add_task_to_group(struct task_struct *p, struct 
related_thread_group *grp) +{ + struct rq *rq; + + raw_spin_lock(&grp->lock); + + /* + * Change p->grp under rq->lock. Will prevent races with read-side + * reference of p->grp in various hot-paths + */ + rq = __task_rq_lock(p); + transfer_busy_time(rq, grp, p, ADD_TASK); + list_add(&p->grp_list, &grp->tasks); + rcu_assign_pointer(p->grp, grp); + __task_rq_unlock(rq); + + _set_preferred_cluster(grp); + + raw_spin_unlock(&grp->lock); + + return 0; +} + +int sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + int rc = 0, destroy = 0; + unsigned long flags; + struct related_thread_group *grp = NULL, *new = NULL; + +redo: + raw_spin_lock_irqsave(&p->pi_lock, flags); + + if ((current != p && p->flags & PF_EXITING) || + (!p->grp && !group_id) || + (p->grp && p->grp->id == group_id)) + goto done; + + write_lock(&related_thread_group_lock); + + if (!group_id) { + remove_task_from_group(p); + write_unlock(&related_thread_group_lock); + goto done; + } + + if (p->grp && p->grp->id != group_id) + remove_task_from_group(p); + + grp = lookup_related_thread_group(group_id); + if (!grp && !new) { + /* New group */ + write_unlock(&related_thread_group_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + new = alloc_related_thread_group(group_id); + if (IS_ERR(new)) + return -ENOMEM; + destroy = 1; + /* Rerun checks (like task exiting), since we dropped pi_lock */ + goto redo; + } else if (!grp && new) { + /* New group - use object allocated before */ + destroy = 0; + list_add(&new->list, &related_thread_groups); + grp = new; + } + + BUG_ON(!grp); + rc = add_task_to_group(p, grp); + write_unlock(&related_thread_group_lock); +done: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + if (new && destroy) { + free_group_cputime(new); + kfree(new); + } + + return rc; +} + +unsigned int sched_get_group_id(struct task_struct *p) +{ + unsigned int group_id; + struct related_thread_group *grp; + + rcu_read_lock(); + grp = task_related_thread_group(p); + group_id = grp ? 
grp->id : 0; + rcu_read_unlock(); + + return group_id; +} + +static void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + + cpumask_copy(&cpumask, cpus); + pre_big_task_count_change(cpu_possible_mask); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + /* 'cpus' can contain cpumask more than one cluster */ + check_for_up_down_migrate_update(&cluster->cpus); + } + + __update_min_max_capacity(); + + post_big_task_count_change(cpu_possible_mask); +} + +static DEFINE_SPINLOCK(cpu_freq_min_max_lock); +void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) +{ + struct cpumask cpumask; + struct sched_cluster *cluster; + int i, update_capacity = 0; + unsigned long flags; + + spin_lock_irqsave(&cpu_freq_min_max_lock, flags); + cpumask_copy(&cpumask, cpus); + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + update_capacity += (cluster->max_mitigated_freq != fmax); + cluster->max_mitigated_freq = fmax; + } + spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); + + if (update_capacity) + update_cpu_cluster_capacity(cpus); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, &policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_cur_freq(cpu) == new_freq) + return 0; + + for_each_cpu(i, 
&policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return 0; +} + +static int pwr_stats_ready_notifier(struct notifier_block *nb, + unsigned long cpu, void *data) +{ + cpumask_t mask = CPU_MASK_NONE; + + cpumask_set_cpu(cpu, &mask); + sched_update_freq_max_load(&mask); + + mutex_lock(&cluster_lock); + sort_clusters(); + mutex_unlock(&cluster_lock); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static struct notifier_block notifier_pwr_stats_ready = { + .notifier_call = pwr_stats_ready_notifier +}; + +int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) +{ + return -EINVAL; +} + +static int register_sched_callback(void) +{ + int ret; + + if (!sched_enable_hmp) + return 0; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +static inline int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + u32 new_load = task_load(p); + + if (!grp) + return 0; + + /* + * Update if task's load has changed significantly or a complete window + * has passed since we last updated preference + */ + if (abs(new_load - old_load) > sched_ravg_window / 4 || + sched_ktime_clock() - grp->last_update > sched_ravg_window) + return 1; + + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } + +static void +update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ +} + +static inline void mark_task_starting(struct task_struct *p) {} + +static inline void set_window_start(struct rq *rq) {} + +static inline void migrate_sync_cpu(int cpu) {} + +#endif /* CONFIG_SCHED_HMP */ + #ifdef CONFIG_SMP /* * This is how migration works: @@ -1071,17 +4284,19 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); + double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); + double_unlock_balance(rq, cpu_rq(new_cpu)); raw_spin_unlock(&rq->lock); rq = cpu_rq(new_cpu); raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1103,6 +4318,8 @@ struct migration_arg { */ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) { + int src_cpu; + if (unlikely(!cpu_active(dest_cpu))) return rq; @@ -1110,11 +4327,40 @@ static 
struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return rq; + src_cpu = cpu_of(rq); rq = move_queued_task(rq, p, dest_cpu); return rq; } +static void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, + struct task_struct *p) +{ + struct migration_notify_data mnd; + bool check_groups; + + rcu_read_lock(); + check_groups = rcu_access_pointer(p->grp) != NULL; + rcu_read_unlock(); + + if (!same_freq_domain(src_cpu, dest_cpu)) { + if (!src_cpu_dead) + check_for_freq_change(cpu_rq(src_cpu), false, + check_groups); + check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); + } else { + check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); + } + + if (task_notify_on_migrate(p)) { + mnd.src_cpu = src_cpu; + mnd.dest_cpu = dest_cpu; + mnd.load = pct_task_load(p); + atomic_notifier_call_chain(&migration_notifier_head, 0, + (void *)&mnd); + } +} + /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then @@ -1125,6 +4371,8 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + int src_cpu = cpu_of(rq); + bool moved = false; /* * The original target cpu might have gone down and we might @@ -1145,12 +4393,18 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ - if (task_rq(p) == rq && task_on_rq_queued(p)) + if (task_rq(p) == rq && task_on_rq_queued(p)) { rq = __migrate_task(rq, p, arg->dest_cpu); + moved = true; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); local_irq_enable(); + + if (moved) + notify_migration(src_cpu, arg->dest_cpu, false, p); + return 0; } @@ -1224,7 +4478,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1235,7 +4490,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ @@ -1274,6 +4528,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. 
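Editor's note: notify_migration() above decides which side of a migration needs its frequency re-evaluated: when source and destination sit in different frequency domains both are checked (unless the source CPU is dead), otherwise only the destination, and a separate per-task flag gates the migration notifier chain. A reduced model of that decision logic; the callback types and the dropped "check_groups" argument are illustrative, not the kernel API:

	#include <stdbool.h>

	struct migration_event {
		int src_cpu, dest_cpu;
		unsigned int load_pct;		/* task load as a percentage */
	};

	/* Stand-ins for the frequency re-evaluation and the notifier chain. */
	typedef void (*freq_check_fn)(int cpu, bool same_domain);
	typedef void (*notify_fn)(const struct migration_event *ev);

	static void handle_migration(const struct migration_event *ev,
				     bool same_freq_domain, bool src_cpu_dead,
				     bool task_notify_on_migrate,
				     freq_check_fn check_freq, notify_fn notify)
	{
		if (!same_freq_domain) {
			/* Both domains changed load; re-evaluate each side. */
			if (!src_cpu_dead)
				check_freq(ev->src_cpu, false);
			check_freq(ev->dest_cpu, false);
		} else {
			/* Same domain: single check on the destination. */
			check_freq(ev->dest_cpu, true);
		}

		if (task_notify_on_migrate)
			notify(ev);
	}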
+ */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1290,13 +4553,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif #endif - trace_sched_migrate_task(p, new_cpu); + trace_sched_migrate_task(p, new_cpu, pct_task_load(p)); if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); + + fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1310,9 +4575,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* @@ -1498,7 +4765,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(queued)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + ktime_t to = ktime_set(0, NSEC_PER_MSEC); set_current_state(TASK_UNINTERRUPTIBLE); schedule_hrtimeout(&to, HRTIMER_MODE_REL); @@ -1717,6 +4984,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); + p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -1808,6 +5076,8 @@ void sched_ttwu_pending(void) void scheduler_ipi(void) { + int cpu = smp_processor_id(); + /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send @@ -1815,9 +5085,18 @@ void scheduler_ipi(void) */ preempt_fold_need_resched(); - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() && + !got_boost_kick()) return; + if (got_boost_kick()) { + struct rq *rq = cpu_rq(cpu); + + if (rq->curr->sched_class == &fair_sched_class) + check_for_migration(rq, rq->curr); + clear_boost_kick(cpu); + } + /* * Not all reschedule IPI handlers call irq_enter/irq_exit, since * traditionally all their work was done from the interrupt return @@ -1905,6 +5184,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) raw_spin_unlock(&rq->lock); } +__read_mostly unsigned int sysctl_sched_wakeup_load_threshold = 110; + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -1924,7 +5205,19 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; - int cpu, success = 0; + int cpu, src_cpu, success = 0; + int notify = 0; + struct migration_notify_data mnd; +#ifdef CONFIG_SMP + unsigned int old_load; + struct rq *rq; + u64 wallclock; + struct related_thread_group *grp = NULL; +#endif + bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER); + bool check_group = false; + + wake_flags &= ~WF_NO_NOTIFIER; /* * If we are going to wake up a thread waiting for CONDITION we @@ -1934,13 +5227,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + src_cpu = cpu = task_cpu(p); + if (!(p->state & state)) goto out; trace_sched_waking(p); success = 1; /* we're going to change ->state */ - cpu = task_cpu(p); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; @@ -1982,6 +5276,22 @@ try_to_wake_up(struct task_struct *p, 
unsigned int state, int wake_flags) */ smp_rmb(); + rq = cpu_rq(task_cpu(p)); + + raw_spin_lock(&rq->lock); + old_load = task_load(p); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + raw_spin_unlock(&rq->lock); + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (update_preferred_cluster(grp, p, old_load)) + set_preferred_cluster(grp); + rcu_read_unlock(); + check_group = grp != NULL; + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -1989,18 +5299,55 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) { + + /* Refresh src_cpu as it could have changed since we last read it */ + src_cpu = task_cpu(p); + if (src_cpu != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } -#endif /* CONFIG_SMP */ + set_task_last_wake(p, wallclock); +#endif /* CONFIG_SMP */ ttwu_queue(p, cpu); stat: ttwu_stat(p, cpu, wake_flags); + + if (task_notify_on_migrate(p)) { + mnd.src_cpu = src_cpu; + mnd.dest_cpu = cpu; + mnd.load = pct_task_load(p); + + /* + * Call the migration notifier with mnd for foreground task + * migrations as well as for wakeups if their load is above + * sysctl_sched_wakeup_load_threshold. This would prompt the + * cpu-boost to boost the CPU frequency on wake up of a heavy + * weight foreground task + */ + if ((src_cpu != cpu) || (mnd.load > + sysctl_sched_wakeup_load_threshold)) + notify = 1; + } + out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); + if (notify) + atomic_notifier_call_chain(&migration_notifier_head, + 0, (void *)&mnd); + + if (freq_notif_allowed) { + if (!same_freq_domain(src_cpu, cpu)) { + check_for_freq_change(cpu_rq(cpu), + false, check_group); + check_for_freq_change(cpu_rq(src_cpu), + false, check_group); + } else if (success) { + check_for_freq_change(cpu_rq(cpu), true, false); + } + } + return success; } @@ -2016,9 +5363,13 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) + if (rq != this_rq() || p == current) { + printk_deferred("%s: Failed to wakeup task %d (%s), rq = %p," + " this_rq = %p, p = %p, current = %p\n", + __func__, task_pid_nr(p), p->comm, rq, + this_rq(), p, current); return; + } lockdep_assert_held(&rq->lock); @@ -2041,13 +5392,20 @@ static void try_to_wake_up_local(struct task_struct *p) trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + u64 wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + set_task_last_wake(p, wallclock); + } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); + /* Todo : Send cpufreq notifier */ } /** @@ -2068,6 +5426,26 @@ int wake_up_process(struct task_struct *p) } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_process_no_notif - Wake up a specific process without notifying + * governor + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. 
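Editor's note: in the try_to_wake_up() changes above, the migration notifier fires only for tasks with notify_on_migrate set, and then either because the wakeup moved the task to another CPU or because the woken task's load percentage exceeds sysctl_sched_wakeup_load_threshold. With the default of 110 the pure-load trigger can never fire, since a percentage load tops out at 100. A small sketch of that predicate (illustrative only):

	#include <stdbool.h>

	/* Mirrors the default above: >100 disables the pure-load trigger. */
	static unsigned int wakeup_load_threshold = 110;

	/*
	 * Report a wakeup to the migration notifier if the task changed CPU
	 * or is heavy enough to justify a frequency boost on wakeup.
	 */
	static bool should_notify_wakeup(int src_cpu, int dest_cpu,
					 unsigned int load_pct)
	{
		return src_cpu != dest_cpu || load_pct > wakeup_load_threshold;
	}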
+ * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +int wake_up_process_no_notif(struct task_struct *p) +{ + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER); +} +EXPORT_SYMBOL(wake_up_process_no_notif); + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); @@ -2107,6 +5485,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -2375,6 +5754,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + init_new_task_load(p); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2387,6 +5767,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); + mark_task_starting(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2514,6 +5895,11 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + +#ifdef CONFIG_MSM_APP_SETTINGS + if (use_app_setting) + switch_app_setting_bit(prev, next); +#endif } /** @@ -2775,7 +6161,7 @@ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) *load = rq->load.weight; } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) /* * sched_exec - execve() is a valuable balancing opportunity, because at @@ -2785,9 +6171,13 @@ void sched_exec(void) { struct task_struct *p = current; unsigned long flags; - int dest_cpu; + int dest_cpu, curr_cpu; + + if (sched_enable_hmp) + return; raw_spin_lock_irqsave(&p->pi_lock, flags); + curr_cpu = task_cpu(p); dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -2796,7 +6186,7 @@ void sched_exec(void) struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); return; } unlock: @@ -2854,6 +6244,37 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_SCHED_HMP +static bool early_detection_notify(struct rq *rq, u64 wallclock) +{ + struct task_struct *p; + int loop_max = 10; + + if (!sched_boost() || !rq->cfs.h_nr_running) + return 0; + + rq->ed_task = NULL; + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { + if (!loop_max) + break; + + if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) { + rq->ed_task = p; + return 1; + } + + loop_max--; + } + + return 0; +} +#else /* CONFIG_SCHED_HMP */ +static bool early_detection_notify(struct rq *rq, u64 wallclock) +{ + return 0; +} +#endif /* CONFIG_SCHED_HMP */ + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
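Editor's note: early_detection_notify() above only runs while scheduler boost is active and the runqueue has CFS tasks; it walks at most ten runnable tasks and flags the first one that has been waiting since its last wakeup for longer than EARLY_DETECTION_DURATION, so the load-alert notifier can react before the window statistics catch up. A simplified userspace model of that bounded scan, with an assumed threshold value and invented types:

	#include <stddef.h>
	#include <stdint.h>

	#define SCAN_LIMIT 10
	#define EARLY_DETECT_NS 10000000ULL	/* assumed threshold, illustration only */

	struct simple_task {
		uint64_t last_wake_ts;		/* ns timestamp of last wakeup */
	};

	/*
	 * Return the first of at most SCAN_LIMIT runnable tasks that has been
	 * waiting longer than the early-detection threshold, or NULL.
	 */
	static struct simple_task *find_starved_task(struct simple_task *tasks,
						     size_t nr, uint64_t now)
	{
		size_t limit = nr < SCAN_LIMIT ? nr : SCAN_LIMIT;

		for (size_t i = 0; i < limit; i++) {
			if (now - tasks[i].last_wake_ts >= EARLY_DETECT_NS)
				return &tasks[i];
		}
		return NULL;
	}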
@@ -2863,16 +6284,29 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + u64 wallclock; + bool early_notif; + u32 old_load; + struct related_thread_group *grp; sched_clock_tick(); raw_spin_lock(&rq->lock); + old_load = task_load(curr); + set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); calc_global_load_tick(rq); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + early_notif = early_detection_notify(rq, wallclock); raw_spin_unlock(&rq->lock); + if (early_notif) + atomic_notifier_call_chain(&load_alert_notifier_head, + 0, (void *)(long)cpu); + perf_event_task_tick(); #ifdef CONFIG_SMP @@ -2880,6 +6314,15 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); + + rcu_read_lock(); + grp = task_related_thread_group(curr); + if (update_preferred_cluster(grp, curr, old_load)) + set_preferred_cluster(grp); + rcu_read_unlock(); + + if (curr->sched_class == &fair_sched_class) + check_for_migration(rq, curr); } #ifdef CONFIG_NO_HZ_FULL @@ -2998,6 +6441,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); +#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -3106,6 +6552,7 @@ static void __sched notrace __schedule(bool preempt) unsigned long *switch_count; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3167,15 +6614,22 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = sched_ktime_clock(); + update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; + BUG_ON(task_cpu(next) != cpu_of(rq)); + if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; ++*switch_count; + set_task_last_switch_out(prev, wallclock); + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); @@ -4081,7 +7535,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); +EXPORT_SYMBOL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -4958,7 +8412,7 @@ void show_state_filter(unsigned long state_filter) touch_all_softlockup_watchdogs(); -#ifdef CONFIG_SCHED_DEBUG +#ifdef CONFIG_SYSRQ_SCHED_DEBUG sysrq_sched_debug_show(); #endif rcu_read_unlock(); @@ -4987,10 +8441,11 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + __sched_fork(0, idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); - __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -5292,8 +8747,11 @@ static void migrate_tasks(struct rq *dead_rq) rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { + raw_spin_unlock(&next->pi_lock); raw_spin_unlock(&rq->lock); + notify_migration(dead_rq->cpu, dest_cpu, true, next); rq = dead_rq; + raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); } raw_spin_unlock(&next->pi_lock); @@ -5524,6 +8982,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action & 
~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + raw_spin_lock_irqsave(&rq->lock, flags); + set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; account_reset_rq(rq); break; @@ -5544,6 +9005,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + migrate_sync_cpu(cpu); + if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -5554,6 +9017,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DEAD: + clear_hmp_request(cpu); calc_load_migrate(rq); break; #endif @@ -6028,6 +9492,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + unsigned long next_balance = rq->next_balance; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -6059,6 +9524,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + for (tmp = sd; tmp; ) { + unsigned long interval; + + interval = msecs_to_jiffies(tmp->balance_interval); + if (time_after(next_balance, tmp->last_balance + interval)) + next_balance = tmp->last_balance + interval; + + tmp = tmp->parent; + } + rq->next_balance = next_balance; + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); @@ -6950,6 +10426,9 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); #endif +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); +#endif /* Fixup, ensure @sd has at least @child cpus. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -7324,6 +10803,8 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); + update_cluster_topology(); + init_hrtick(); /* Move init over to a non-isolated CPU */ @@ -7342,6 +10823,7 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -7365,6 +10847,15 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; + if (sched_enable_hmp) + pr_info("HMP scheduling enabled.\n"); + + BUG_ON(num_possible_cpus() > BITS_PER_LONG); + +#ifdef CONFIG_SCHED_HMP + init_clusters(); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -7475,11 +10966,43 @@ void __init sched_init(void) rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; + rq->push_task = NULL; rq->cpu = i; rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_HMP + cpumask_set_cpu(i, &rq->freq_domain_cpumask); + rq->hmp_stats.cumulative_runnable_avg = 0; + rq->window_start = 0; + rq->hmp_stats.nr_big_tasks = 0; + rq->hmp_flags = 0; + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; + rq->static_cpu_pwr_cost = 0; + rq->cc.cycles = SCHED_MIN_FREQ; + rq->cc.time = 1; + + /* + * All cpus part of same cluster by default. 
This avoids the + * need to check for rq->cluster being non-NULL in hot-paths + * like select_best_cpu() + */ + rq->cluster = &init_cluster; +#ifdef CONFIG_SCHED_FREQ_INPUT + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + rq->old_busy_time = 0; + rq->old_estimated_time = 0; + rq->old_busy_time_group = 0; + rq->notifier_sent = 0; + rq->hmp_stats.pred_demands_sum = 0; +#endif +#endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; + rq->cstate = 0; + rq->wakeup_latency = 0; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7495,6 +11018,8 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } + set_hmp_defaults(); + set_load_weight(&init_task); #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -7543,6 +11068,14 @@ static inline int preempt_count_equals(int preempt_offset) return (nested == preempt_offset); } +static int __might_sleep_init_called; +int __init __might_sleep_init(void) +{ + __might_sleep_init_called = 1; + return 0; +} +early_initcall(__might_sleep_init); + void __might_sleep(const char *file, int line, int preempt_offset) { /* @@ -7567,8 +11100,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + !is_idle_task(current)) || oops_in_progress) + return; + if (system_state != SYSTEM_RUNNING && + (!__might_sleep_init_called || system_state != SYSTEM_BOOTING)) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; @@ -7595,6 +11130,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) pr_cont("\n"); } #endif +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); +#endif dump_stack(); } EXPORT_SYMBOL(___might_sleep); @@ -8269,6 +11807,63 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) sched_move_task(task); } +static u64 cpu_notify_on_migrate_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->notify_on_migrate; +} + +static int cpu_notify_on_migrate_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 notify) +{ + struct task_group *tg = css_tg(css); + + tg->notify_on_migrate = (notify > 0); + + return 0; +} + +#ifdef CONFIG_SCHED_HMP + +static u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->upmigrate_discouraged; +} + +static int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 upmigrate_discourage) +{ + struct task_group *tg = css_tg(css); + int discourage = upmigrate_discourage > 0; + + if (tg->upmigrate_discouraged == discourage) + return 0; + + /* + * Revisit big-task classification for tasks of this cgroup. It would + * have been efficient to walk tasks of just this cgroup in running + * state, but we don't have easy means to do that. Walk all tasks in + * running state on all cpus instead and re-visit their big task + * classification. 
+ */ + get_online_cpus(); + pre_big_task_count_change(cpu_online_mask); + + tg->upmigrate_discouraged = discourage; + + post_big_task_count_change(cpu_online_mask); + put_online_cpus(); + + return 0; +} + +#endif /* CONFIG_SCHED_HMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -8554,6 +12149,18 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, #endif /* CONFIG_RT_GROUP_SCHED */ static struct cftype cpu_files[] = { + { + .name = "notify_on_migrate", + .read_u64 = cpu_notify_on_migrate_read_u64, + .write_u64 = cpu_notify_on_migrate_write_u64, + }, +#ifdef CONFIG_SCHED_HMP + { + .name = "upmigrate_discourage", + .read_u64 = cpu_upmigrate_discourage_read_u64, + .write_u64 = cpu_upmigrate_discourage_write_u64, + }, +#endif #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -8600,6 +12207,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f74ea89e77a8..f29b132a9f8b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -49,6 +49,8 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; + u64 wallclock; + bool account = true; if (!sched_clock_irqtime) return; @@ -56,7 +58,8 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + wallclock = sched_clock_cpu(cpu); + delta = wallclock - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); irq_time_write_begin(); @@ -70,8 +73,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); + else + account = false; irq_time_write_end(); + + if (account) + sched_account_irqtime(cpu, curr, delta, wallclock); + else if (curr != this_cpu_ksoftirqd()) + sched_account_irqstart(cpu, curr, wallclock); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e285f9..44178fea87d0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -271,9 +271,11 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p /* * By now the task is replenished and enqueued; migrate it. 
*/ + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); activate_task(later_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; if (!fallback) resched_curr(later_rq); @@ -851,6 +853,41 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { @@ -860,6 +897,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -874,6 +912,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -1555,9 +1594,11 @@ retry: goto retry; } + next_task->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_QUEUED; ret = 1; resched_curr(later_rq); @@ -1643,9 +1684,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; dmin = p->dl.deadline; /* Is there any other task even earlier? 
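Editor's note: several hunks above (move_queued_task(), __migrate_swap_task(), and the deadline push/pull paths here) now bracket the deactivate/set_task_cpu/activate sequence with p->on_rq = TASK_ON_RQ_MIGRATING and back to TASK_ON_RQ_QUEUED, so that code such as the schedstat wait-time rebasing can tell an ordinary dequeue from a migration in progress. The pattern, reduced to a sketch with stand-in helpers:

	enum on_rq_state { TASK_NOT_ON_RQ = 0, TASK_QUEUED = 1, TASK_MIGRATING = 2 };

	struct mini_task {
		enum on_rq_state on_rq;
		int cpu;
	};

	/* Stand-ins for dequeue/enqueue on a per-CPU runqueue (model only). */
	static void model_deactivate(struct mini_task *p) { (void)p; }
	static void model_activate(struct mini_task *p) { (void)p; }

	/*
	 * Migrate @p to @new_cpu. Marking the task as MIGRATING around the move
	 * lets observers distinguish "temporarily off the queue because it is
	 * moving" from "really dequeued".
	 */
	static void model_migrate(struct mini_task *p, int new_cpu)
	{
		p->on_rq = TASK_MIGRATING;
		model_deactivate(p);
		p->cpu = new_cpu;
		model_activate(p);
		p->on_rq = TASK_QUEUED;
	}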
*/ @@ -1846,6 +1889,11 @@ const struct sched_class dl_sched_class = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_dl, + .dec_hmp_sched_stats = dec_hmp_sched_stats_dl, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_dl, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..d1c0ef4bf07d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -227,6 +227,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->throttled); SEQ_printf(m, " .%-30s: %d\n", "throttle_count", cfs_rq->throttle_count); + SEQ_printf(m, " .%-30s: %d\n", "runtime_enabled", + cfs_rq->runtime_enabled); +#ifdef CONFIG_SCHED_HMP + SEQ_printf(m, " .%-30s: %d\n", "nr_big_tasks", + cfs_rq->hmp_stats.nr_big_tasks); + SEQ_printf(m, " .%-30s: %llu\n", "cumulative_runnable_avg", + cfs_rq->hmp_stats.cumulative_runnable_avg); +#endif #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -306,6 +314,25 @@ do { \ P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); +#ifdef CONFIG_SMP + P(cpu_capacity); +#endif +#ifdef CONFIG_SCHED_HMP + P(static_cpu_pwr_cost); + P(cluster->static_cluster_pwr_cost); + P(cluster->load_scale_factor); + P(cluster->capacity); + P(cluster->max_possible_capacity); + P(cluster->efficiency); + P(cluster->cur_freq); + P(cluster->max_freq); + P(cluster->exec_scale_factor); +#endif +#ifdef CONFIG_SCHED_HMP + P(hmp_stats.nr_big_tasks); + SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg", + rq->hmp_stats.cumulative_runnable_avg); +#endif #undef P #undef PN @@ -386,6 +413,16 @@ static void sched_debug_header(struct seq_file *m) PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#ifdef CONFIG_SCHED_HMP + P(sched_upmigrate); + P(sched_downmigrate); + P(sched_init_task_load_windows); + P(sched_init_task_load_pelt); + P(min_capacity); + P(max_capacity); + P(sched_use_pelt); + P(sched_ravg_window); +#endif #undef PN #undef P @@ -408,6 +445,7 @@ static int sched_debug_show(struct seq_file *m, void *v) return 0; } +#ifdef CONFIG_SYSRQ_SCHED_DEBUG void sysrq_sched_debug_show(void) { int cpu; @@ -417,6 +455,7 @@ void sysrq_sched_debug_show(void) print_cpu(NULL, cpu); } +#endif /* * This itererator needs some explanation. 
@@ -547,6 +586,9 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; + unsigned int load_avg; + + load_avg = pct_task_load(p); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), get_nr_threads(p)); @@ -598,6 +640,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.nr_wakeups_passive); P(se.statistics.nr_wakeups_idle); +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + __P(load_avg); +#ifdef CONFIG_SCHED_HMP + P(ravg.demand); + P(se.avg.runnable_avg_sum_scaled); +#endif +#endif + { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cfdc0e61066c..958d79e1933c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -31,9 +31,8 @@ #include <linux/migrate.h> #include <linux/task_work.h> -#include <trace/events/sched.h> - #include "sched.h" +#include <trace/events/sched.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -81,6 +80,14 @@ static unsigned int sched_nr_latency = 8; unsigned int sysctl_sched_child_runs_first __read_mostly; /* + * Controls whether, when SD_SHARE_PKG_RESOURCES is on, if all + * tasks go to idle CPUs when woken. If this is off, note that the + * per-task flag PF_WAKE_UP_IDLE can still cause a task to go to an + * idle CPU upon being woken. + */ +unsigned int __read_mostly sysctl_sched_wake_to_idle; + +/* * SCHED_OTHER wake-up granularity. * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * @@ -236,6 +243,9 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +#ifdef CONFIG_SMP +static int active_load_balance_cpu_stop(void *data); +#endif const struct sched_class fair_sched_class; @@ -738,12 +748,56 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; } +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. 
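Editor's note: the wait-time bookkeeping reworked in this fair.c hunk preserves a migrating task's accumulated wait: when it leaves the old CPU the elapsed wait is stashed in wait_start instead of being recorded, and when it starts waiting on the new CPU that stash is subtracted from the new runqueue clock, so the wait keeps accumulating even though the two clocks are unrelated. A small model of the same trick:

	#include <stdbool.h>
	#include <stdint.h>

	struct wait_stats {
		uint64_t wait_start;	/* clock stamp, or accumulated wait while migrating */
	};

	/* Task starts waiting on a runqueue whose clock reads @clock. */
	static void wait_begin(struct wait_stats *w, uint64_t clock, bool migrating)
	{
		uint64_t start = clock;

		/* A migrating task carries its accumulated wait in wait_start. */
		if (migrating && start > w->wait_start)
			start -= w->wait_start;

		w->wait_start = start;
	}

	/* Task stops waiting; returns the wait delta to record (0 if deferred). */
	static uint64_t wait_end(struct wait_stats *w, uint64_t clock, bool migrating)
	{
		uint64_t delta = clock - w->wait_start;

		if (migrating) {
			/* Preserve the accumulated wait for the destination CPU. */
			w->wait_start = delta;
			return 0;
		}

		w->wait_start = 0;
		return delta;
	}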
+ */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} +#endif + /* * Task is being enqueued - update stats: */ @@ -757,23 +811,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2442,7 +2479,25 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ +u32 sched_get_wake_up_idle(struct task_struct *p) +{ + u32 enabled = p->flags & PF_WAKE_UP_IDLE; + + return !!enabled; +} + +int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle) +{ + int enable = !!wake_up_idle; + + if (enable) + p->flags |= PF_WAKE_UP_IDLE; + else + p->flags &= ~PF_WAKE_UP_IDLE; + + return 0; +} + static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, @@ -2522,6 +2577,1709 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +static void add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta); +static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods); + +struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) +{ + return NULL; +} + +enum sched_boost_type { + SCHED_BOOST_NONE, + SCHED_BOOST_ON_BIG, + SCHED_BOOST_ON_ALL, +}; + +#ifdef CONFIG_SCHED_HMP + +/* Initial task load. Newly created tasks are assigned this load. */ +unsigned int __read_mostly sched_init_task_load_pelt; +unsigned int __read_mostly sched_init_task_load_windows; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +unsigned int max_task_load(void) +{ + if (sched_use_pelt) + return LOAD_AVG_MAX; + + return sched_ravg_window; +} + +/* Use this knob to turn on or off HMP-aware task placement logic */ +unsigned int __read_mostly sched_enable_hmp = 0; + +/* A cpu can no longer accomodate more tasks if: + * + * rq->nr_running > sysctl_sched_spill_nr_run || + * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load + */ +unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; + +/* + * Place sync wakee tasks those have less than configured demand to the waker's + * cluster. 
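Editor's note: the spill comment above boils down to a two-part predicate; a CPU stops accepting placements once either its task count or its cumulative runnable load crosses the configured threshold, which is when inter-cluster spill becomes eligible. A trivial sketch with illustrative defaults:

	#include <stdbool.h>
	#include <stdint.h>

	/* Illustrative defaults mirroring the tunables described above. */
	static unsigned int spill_nr_run = 10;	/* sysctl_sched_spill_nr_run */
	static uint64_t spill_load;		/* sched_spill_load, window-time units */

	/*
	 * A CPU is "full" when it runs too many tasks or its cumulative
	 * runnable load exceeds the spill threshold.
	 */
	static bool cpu_is_full(unsigned int nr_running, uint64_t cumulative_load)
	{
		return nr_running > spill_nr_run || cumulative_load > spill_load;
	}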
+ */ +unsigned int __read_mostly sched_small_wakee_task_load; +unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; + +unsigned int __read_mostly sched_big_waker_task_load; +unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; + +/* + * CPUs with load greater than the sched_spill_load_threshold are not + * eligible for task placement. When all CPUs in a cluster achieve a + * load higher than this level, tasks becomes eligible for inter + * cluster migration. + */ +unsigned int __read_mostly sched_spill_load; +unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; + +/* + * Tasks whose bandwidth consumption on a cpu is more than + * sched_upmigrate are considered "big" tasks. Big tasks will be + * considered for "up" migration, i.e migrating to a cpu with better + * capacity. + */ +unsigned int __read_mostly sched_upmigrate; +unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; + +/* + * Big tasks, once migrated, will need to drop their bandwidth + * consumption to less than sched_downmigrate before they are "down" + * migrated. + */ +unsigned int __read_mostly sched_downmigrate; +unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; + +#define SCHED_UPMIGRATE_MIN_NICE 15 + +/* + * The load scale factor of a CPU gets boosted when its max frequency + * is restricted due to which the tasks are migrating to higher capacity + * CPUs early. The sched_upmigrate threshold is auto-upgraded by + * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. + */ +unsigned int up_down_migrate_scale_factor = 1024; + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. + */ +unsigned int sysctl_sched_boost; + +/* + * Scheduler selects and places task to its previous CPU if sleep time is + * less than sysctl_sched_select_prev_cpu_us. 
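Editor's note: update_up_down_migrate(), defined just below, converts the upmigrate/downmigrate percentages into absolute window-time thresholds and then rescales them by up_down_migrate_scale_factor (1024 meaning "no scaling") when a lower-capacity CPU's max frequency is clamped, while preserving the original gap between the two thresholds so the hysteresis band does not collapse. A simplified model of that computation, assuming a 20ms window and a hypothetical pct_to_window() helper:

	#include <stdint.h>

	#define RAVG_WINDOW_NS 20000000ULL	/* assumed stats window, illustration only */

	static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

	/* Percentage of the stats window -> absolute threshold in nanoseconds. */
	static uint64_t pct_to_window(unsigned int pct)
	{
		return RAVG_WINDOW_NS * pct / 100;
	}

	/*
	 * Compute up/down migration thresholds. @scale_factor is 1024 when the
	 * lower-capacity CPUs run at their full possible frequency and grows as
	 * 1024 * max_possible_freq / max_freq when they are clamped, so load
	 * measured against a clamped frequency is not mistaken for a "big"
	 * task too early.
	 */
	static void compute_migrate_thresholds(unsigned int up_pct,
					       unsigned int down_pct,
					       unsigned int scale_factor,
					       uint64_t *up, uint64_t *down)
	{
		uint64_t up_th = pct_to_window(up_pct);
		uint64_t down_th = pct_to_window(down_pct);
		uint64_t gap = up_th - down_th;	/* keep the hysteresis band */

		if (scale_factor != 1024) {
			up_th = min_u64(up_th * scale_factor >> 10, RAVG_WINDOW_NS);
			down_th = min_u64(down_th * scale_factor >> 10, up_th - gap);
		}

		*up = up_th;
		*down = down_th;
	}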
+ */ +static unsigned int __read_mostly +sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; +unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; + +static unsigned int __read_mostly +sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; + +unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; + +void update_up_down_migrate(void) +{ + unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); + unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); + unsigned int delta; + + if (up_down_migrate_scale_factor == 1024) + goto done; + + delta = up_migrate - down_migrate; + + up_migrate /= NSEC_PER_USEC; + up_migrate *= up_down_migrate_scale_factor; + up_migrate >>= 10; + up_migrate *= NSEC_PER_USEC; + + up_migrate = min(up_migrate, sched_ravg_window); + + down_migrate /= NSEC_PER_USEC; + down_migrate *= up_down_migrate_scale_factor; + down_migrate >>= 10; + down_migrate *= NSEC_PER_USEC; + + down_migrate = min(down_migrate, up_migrate - delta); +done: + sched_upmigrate = up_migrate; + sched_downmigrate = down_migrate; +} + +void set_hmp_defaults(void) +{ + sched_spill_load = + pct_to_real(sysctl_sched_spill_load_pct); + + update_up_down_migrate(); + +#ifdef CONFIG_SCHED_FREQ_INPUT + sched_major_task_runtime = + mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100); +#endif + + sched_init_task_load_pelt = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)LOAD_AVG_MAX, 100); + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * + NSEC_PER_USEC; + + sched_small_wakee_task_load = + div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_big_waker_task_load = + div64_u64((u64)sysctl_sched_big_waker_task_load_pct * + (u64)sched_ravg_window, 100); +} + +u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +#ifdef CONFIG_CGROUP_SCHED + +static inline int upmigrate_discouraged(struct task_struct *p) +{ + return task_group(p)->upmigrate_discouraged; +} + +#else + +static inline int upmigrate_discouraged(struct task_struct *p) +{ + return 0; +} + +#endif + +/* Is a task "big" on its current cpu */ +static inline int __is_big_task(struct task_struct *p, u64 scaled_load) +{ + int nice = task_nice(p); + + if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) + return 0; + + return scaled_load > sched_upmigrate; +} + +static inline int is_big_task(struct task_struct *p) +{ + return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); +} + +static inline u64 cpu_load(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); +} + +static inline u64 cpu_load_sync(int cpu, int sync) +{ + return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); +} + +static int boost_refcount; +static DEFINE_SPINLOCK(boost_lock); +static DEFINE_MUTEX(boost_mutex); + +static void boost_kick_cpus(void) +{ + int i; + + for_each_online_cpu(i) { + if (cpu_capacity(i) != max_capacity) + boost_kick(i); + } +} + +int sched_boost(void) +{ + return boost_refcount > 0; +} + +int sched_set_boost(int enable) +{ + unsigned long flags; + int ret = 0; + int old_refcount; + + if (!sched_enable_hmp) + 
return -EINVAL; + + spin_lock_irqsave(&boost_lock, flags); + + old_refcount = boost_refcount; + + if (enable == 1) { + boost_refcount++; + } else if (!enable) { + if (boost_refcount >= 1) + boost_refcount--; + else + ret = -EINVAL; + } else { + ret = -EINVAL; + } + + if (!old_refcount && boost_refcount) + boost_kick_cpus(); + + trace_sched_set_boost(boost_refcount); + spin_unlock_irqrestore(&boost_lock, flags); + + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&boost_mutex); + if (!write) + sysctl_sched_boost = sched_boost(); + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write) + goto done; + + ret = (sysctl_sched_boost <= 1) ? + sched_set_boost(sysctl_sched_boost) : -EINVAL; + +done: + mutex_unlock(&boost_mutex); + return ret; +} + +/* + * A task will fit on a cpu if its bandwidth consumption on that cpu + * will be less than sched_upmigrate. A big task that was previously + * "up" migrated will be considered fitting on a "little" cpu if its + * bandwidth consumption on the "little" cpu will be less than + * sched_downmigrate. This will help avoid frequent migrations for + * tasks with load close to the upmigrate threshold. + */ + +static int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_type boost_type) +{ + int upmigrate; + + if (cpu_capacity(cpu) == max_capacity) + return 1; + + if (boost_type != SCHED_BOOST_ON_BIG) { + if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) + return 1; + + upmigrate = sched_upmigrate; + if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) + upmigrate = sched_downmigrate; + + if (task_load < upmigrate) + return 1; + } + + return 0; +} + +static enum sched_boost_type sched_boost_type(void) +{ + if (sched_boost()) { + if (min_possible_efficiency != max_possible_efficiency) + return SCHED_BOOST_ON_BIG; + else + return SCHED_BOOST_ON_ALL; + } + return SCHED_BOOST_NONE; +} + +static int task_will_fit(struct task_struct *p, int cpu) +{ + u64 tload = scale_load_to_cpu(task_load(p), cpu); + + return task_load_will_fit(p, tload, cpu, sched_boost_type()); +} + +int group_will_fit(struct sched_cluster *cluster, + struct related_thread_group *grp, u64 demand) +{ + int cpu = cluster_first_cpu(cluster); + int prev_capacity = 0; + unsigned int threshold = sched_upmigrate; + u64 load; + + if (cluster->capacity == max_capacity) + return 1; + + if (grp->preferred_cluster) + prev_capacity = grp->preferred_cluster->capacity; + + if (cluster->capacity < prev_capacity) + threshold = sched_downmigrate; + + load = scale_load_to_cpu(demand, cpu); + if (load < threshold) + return 1; + + return 0; +} + +/* + * Return the cost of running task p on CPU cpu. This function + * currently assumes that task p is the only task which will run on + * the CPU. + */ +unsigned int power_cost(int cpu, u64 demand) +{ + int first, mid, last; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct cpu_pstate_pwr *costs; + struct freq_max_load *max_load; + int total_static_pwr_cost = 0; + struct rq *rq = cpu_rq(cpu); + unsigned int pc; + + if (!per_cpu_info || !per_cpu_info[cpu].ptable) + /* When power aware scheduling is not in use, or CPU + * power data is not available, just use the CPU + * capacity as a rough stand-in for real CPU power + * numbers, assuming bigger CPUs are more power + * hungry.
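+ * The per-frequency power table lookup below is skipped in that case.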
*/ + return cpu_max_possible_capacity(cpu); + + rcu_read_lock(); + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (!max_load) { + pc = cpu_max_possible_capacity(cpu); + goto unlock; + } + + costs = per_cpu_info[cpu].ptable; + + if (demand <= max_load->freqs[0].hdemand) { + pc = costs[0].power; + goto unlock; + } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { + pc = costs[max_load->length - 1].power; + goto unlock; + } + + first = 0; + last = max_load->length - 1; + mid = (last - first) >> 1; + while (1) { + if (demand <= max_load->freqs[mid].hdemand) + last = mid; + else + first = mid; + + if (last - first == 1) + break; + mid = first + ((last - first) >> 1); + } + + pc = costs[last].power; + +unlock: + rcu_read_unlock(); + + if (idle_cpu(cpu) && rq->cstate) { + total_static_pwr_cost += rq->static_cpu_pwr_cost; + if (rq->cluster->dstate) + total_static_pwr_cost += + rq->cluster->static_cluster_pwr_cost; + } + + return pc + total_static_pwr_cost; + +} + +struct cpu_select_env { + struct task_struct *p; + struct related_thread_group *rtg; + u8 reason; + u8 need_idle:1; + u8 need_waker_cluster:1; + u8 sync:1; + u8 ignore_prev_cpu:1; + enum sched_boost_type boost_type; + int prev_cpu; + DECLARE_BITMAP(candidate_list, NR_CPUS); + DECLARE_BITMAP(backup_list, NR_CPUS); + u64 task_load; + u64 cpu_load; +}; + +struct cluster_cpu_stats { + int best_idle_cpu, least_loaded_cpu; + int best_capacity_cpu, best_cpu, best_sibling_cpu; + int min_cost, best_sibling_cpu_cost; + int best_cpu_cstate; + u64 min_load, best_load, best_sibling_cpu_load; + s64 highest_spare_capacity; +}; + +#define UP_MIGRATION 1 +#define DOWN_MIGRATION 2 +#define IRQLOAD_MIGRATION 3 + +/* + * Invoked from three places: + * 1) try_to_wake_up() -> ... -> select_best_cpu() + * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu() + * 3) can_migrate_task() + * + * Its safe to de-reference p->grp in first case (since p->pi_lock is held) + * but not in other cases. 
p->grp is hence freed after a RCU grace period and + * accessed under rcu_read_lock() + */ +static inline int +preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + struct related_thread_group *grp; + int rc = 0; + + rcu_read_lock(); + + grp = task_related_thread_group(p); + if (!grp || !sysctl_sched_enable_colocation) + rc = 1; + else + rc = (grp->preferred_cluster == cluster); + + rcu_read_unlock(); + return rc; +} + +static inline struct sched_cluster *rq_cluster(struct rq *rq) +{ + return rq->cluster; +} + +static int +spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +{ + u64 total_load; + + total_load = env->task_load + env->cpu_load; + + if (total_load > sched_spill_load || + (rq->nr_running + 1) > sysctl_sched_spill_nr_run) + return 1; + + return 0; +} + +static int skip_cpu(int cpu, struct cpu_select_env *env) +{ + int tcpu = task_cpu(env->p); + int skip = 0; + + if (!env->reason) + return 0; + + if (is_reserved(cpu)) + return 1; + + switch (env->reason) { + case UP_MIGRATION: + skip = !idle_cpu(cpu); + break; + case IRQLOAD_MIGRATION: + /* Purposely fall through */ + default: + skip = (cpu == tcpu); + break; + } + + return skip; +} + +static inline int +acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env) +{ + int tcpu; + + if (!env->reason) + return 1; + + tcpu = task_cpu(env->p); + switch (env->reason) { + case UP_MIGRATION: + return cluster->capacity > cpu_capacity(tcpu); + + case DOWN_MIGRATION: + return cluster->capacity < cpu_capacity(tcpu); + + default: + break; + } + + return 1; +} + +static int +skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env) +{ + if (!test_bit(cluster->id, env->candidate_list)) + return 1; + + if (!acceptable_capacity(cluster, env)) { + __clear_bit(cluster->id, env->candidate_list); + return 1; + } + + return 0; +} + +static struct sched_cluster * +select_least_power_cluster(struct cpu_select_env *env) +{ + struct sched_cluster *cluster; + + if (env->rtg) { + env->task_load = scale_load_to_cpu(task_load(env->p), + cluster_first_cpu(env->rtg->preferred_cluster)); + return env->rtg->preferred_cluster; + } + + for_each_sched_cluster(cluster) { + if (!skip_cluster(cluster, env)) { + int cpu = cluster_first_cpu(cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), + cpu); + if (task_load_will_fit(env->p, env->task_load, cpu, + env->boost_type)) + return cluster; + + __set_bit(cluster->id, env->backup_list); + __clear_bit(cluster->id, env->candidate_list); + } + } + + return NULL; +} + +static struct sched_cluster * +next_candidate(const unsigned long *list, int start, int end) +{ + int cluster_id; + + cluster_id = find_next_bit(list, end, start - 1 + 1); + if (cluster_id >= end) + return NULL; + + return sched_cluster[cluster_id]; +} + +static void +update_spare_capacity(struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu, int capacity, + u64 cpu_load) +{ + s64 spare_capacity = sched_ravg_window - cpu_load; + + if (spare_capacity > 0 && + (spare_capacity > stats->highest_spare_capacity || + (spare_capacity == stats->highest_spare_capacity && + ((!env->need_waker_cluster && + capacity > cpu_capacity(stats->best_capacity_cpu)) || + (env->need_waker_cluster && + cpu_rq(cpu)->nr_running < + cpu_rq(stats->best_capacity_cpu)->nr_running))))) { + /* + * If sync waker is the only runnable of CPU, cr_avg of the + * CPU is 0 so we have high chance to place the wakee on the + * waker's CPU which likely causes preemtion of the waker. 
+ * This can lead to migration of the preempted waker. Place the + * wakee on a genuinely idle CPU when possible by checking + * nr_running to avoid such preemption. + */ + stats->highest_spare_capacity = spare_capacity; + stats->best_capacity_cpu = cpu; + } +} + +static inline void find_backup_cluster( +struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + struct sched_cluster *next = NULL; + int i; + + while (!bitmap_empty(env->backup_list, num_clusters)) { + next = next_candidate(env->backup_list, 0, num_clusters); + __clear_bit(next->id, env->backup_list); + for_each_cpu_and(i, &env->p->cpus_allowed, &next->cpus) { + trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), + sched_irqload(i), power_cost(i, task_load(env->p) + + cpu_cravg_sync(i, env->sync)), 0); + + update_spare_capacity(stats, env, i, next->capacity, + cpu_load_sync(i, env->sync)); + } + } +} + +struct sched_cluster * +next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env, + struct cluster_cpu_stats *stats) +{ + struct sched_cluster *next = NULL; + + __clear_bit(cluster->id, env->candidate_list); + + if (env->rtg && preferred_cluster(cluster, env->p)) + return NULL; + + do { + if (bitmap_empty(env->candidate_list, num_clusters)) + return NULL; + + next = next_candidate(env->candidate_list, 0, num_clusters); + if (next) { + if (next->min_power_cost > stats->min_cost) { + clear_bit(next->id, env->candidate_list); + next = NULL; + continue; + } + + if (skip_cluster(next, env)) + next = NULL; + } + } while (!next); + + env->task_load = scale_load_to_cpu(task_load(env->p), + cluster_first_cpu(next)); + return next; +} + +#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu_cost) +{ + int cpu_cstate; + int prev_cpu = env->prev_cpu; + + cpu_cstate = cpu_rq(cpu)->cstate; + + if (env->need_idle) { + stats->min_cost = cpu_cost; + if (idle_cpu(cpu)) { + if (cpu_cstate < stats->best_cpu_cstate || + (cpu_cstate == stats->best_cpu_cstate && + cpu == prev_cpu)) { + stats->best_idle_cpu = cpu; + stats->best_cpu_cstate = cpu_cstate; + } + } else { + if (env->cpu_load < stats->min_load || + (env->cpu_load == stats->min_load && + cpu == prev_cpu)) { + stats->least_loaded_cpu = cpu; + stats->min_load = env->cpu_load; + } + } + + return; + } + + if (cpu_cost < stats->min_cost) { + stats->min_cost = cpu_cost; + stats->best_cpu_cstate = cpu_cstate; + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + return; + } + + /* CPU cost is the same. Start breaking the tie by C-state */ + + if (cpu_cstate > stats->best_cpu_cstate) + return; + + if (cpu_cstate < stats->best_cpu_cstate) { + stats->best_cpu_cstate = cpu_cstate; + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + return; + } + + /* C-state is the same.
Use prev CPU to break the tie */ + if (cpu == prev_cpu) { + stats->best_cpu = cpu; + return; + } + + if (stats->best_cpu != prev_cpu && + ((cpu_cstate == 0 && env->cpu_load < stats->best_load) || + (cpu_cstate > 0 && env->cpu_load > stats->best_load))) { + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + } +} +#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */ +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu_cost) +{ + int prev_cpu = env->prev_cpu; + + if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) { + if (stats->best_sibling_cpu_cost > cpu_cost || + (stats->best_sibling_cpu_cost == cpu_cost && + stats->best_sibling_cpu_load > env->cpu_load)) { + stats->best_sibling_cpu_cost = cpu_cost; + stats->best_sibling_cpu_load = env->cpu_load; + stats->best_sibling_cpu = cpu; + } + } + + if ((cpu_cost < stats->min_cost) || + ((stats->best_cpu != prev_cpu && + stats->min_load > env->cpu_load) || cpu == prev_cpu)) { + if (env->need_idle) { + if (idle_cpu(cpu)) { + stats->min_cost = cpu_cost; + stats->best_idle_cpu = cpu; + } + } else { + stats->min_cost = cpu_cost; + stats->min_load = env->cpu_load; + stats->best_cpu = cpu; + } + } +} +#endif + +static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env) +{ + int cpu_cost; + + cpu_cost = power_cost(cpu, task_load(env->p) + + cpu_cravg_sync(cpu, env->sync)); + if (cpu_cost <= stats->min_cost) + __update_cluster_stats(cpu, stats, env, cpu_cost); +} + +static void find_best_cpu_in_cluster(struct sched_cluster *c, + struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + int i; + struct cpumask search_cpus; + + cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus); + if (env->ignore_prev_cpu) + cpumask_clear_cpu(env->prev_cpu, &search_cpus); + + for_each_cpu(i, &search_cpus) { + env->cpu_load = cpu_load_sync(i, env->sync); + + trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), + sched_irqload(i), + power_cost(i, task_load(env->p) + + cpu_cravg_sync(i, env->sync)), 0); + + if (unlikely(!cpu_active(i)) || skip_cpu(i, env)) + continue; + + update_spare_capacity(stats, env, i, c->capacity, + env->cpu_load); + + if (env->boost_type == SCHED_BOOST_ON_ALL || + env->need_waker_cluster || + sched_cpu_high_irqload(i) || + spill_threshold_crossed(env, cpu_rq(i))) + continue; + + update_cluster_stats(i, stats, env); + } +} + +static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats) +{ + stats->best_cpu = stats->best_idle_cpu = -1; + stats->best_capacity_cpu = stats->best_sibling_cpu = -1; + stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX; + stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX; + stats->highest_spare_capacity = 0; + stats->least_loaded_cpu = -1; + stats->best_cpu_cstate = INT_MAX; + /* No need to initialize stats->best_load */ +} + +/* + * Should task be woken to any available idle cpu? + * + * Waking tasks to idle cpu has mixed implications on both performance and + * power. In many cases, scheduler can't estimate correctly impact of using idle + * cpus on either performance or power. PF_WAKE_UP_IDLE allows external kernel + * module to pass a strong hint to scheduler that the task in question should be + * woken to idle cpu, generally to improve performance. 
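+ * The hint is honored when it is set on either the waker (current) or + * the wakee.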
+ */ +static inline int wake_to_idle(struct task_struct *p) +{ + return (current->flags & PF_WAKE_UP_IDLE) || + (p->flags & PF_WAKE_UP_IDLE); +} + +static inline bool +bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + int prev_cpu; + struct task_struct *task = env->p; + struct sched_cluster *cluster; + + if (env->boost_type != SCHED_BOOST_NONE || env->reason || + !task->ravg.mark_start || + env->need_idle || !sched_short_sleep_task_threshold) + return false; + + prev_cpu = env->prev_cpu; + if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) || + unlikely(!cpu_active(prev_cpu))) + return false; + + if (task->ravg.mark_start - task->last_cpu_selected_ts >= + sched_long_cpu_selection_threshold) + return false; + + /* + * This function should be used by task wake up path only as it's + * assuming p->last_switch_out_ts as last sleep time. + * p->last_switch_out_ts can denote last preemption time as well as + * last sleep time. + */ + if (task->ravg.mark_start - task->last_switch_out_ts >= + sched_short_sleep_task_threshold) + return false; + + env->task_load = scale_load_to_cpu(task_load(task), prev_cpu); + cluster = cpu_rq(prev_cpu)->cluster; + + if (!task_load_will_fit(task, env->task_load, prev_cpu, + sched_boost_type())) { + + __set_bit(cluster->id, env->backup_list); + __clear_bit(cluster->id, env->candidate_list); + return false; + } + + env->cpu_load = cpu_load_sync(prev_cpu, env->sync); + if (sched_cpu_high_irqload(prev_cpu) || + spill_threshold_crossed(env, cpu_rq(prev_cpu))) { + update_spare_capacity(stats, env, prev_cpu, + cluster->capacity, env->cpu_load); + env->ignore_prev_cpu = 1; + return false; + } + + return true; +} + +static inline bool +wake_to_waker_cluster(struct cpu_select_env *env) +{ + return !env->need_idle && !env->reason && env->sync && + task_load(current) > sched_big_waker_task_load && + task_load(env->p) < sched_small_wakee_task_load; +} + +static inline int +cluster_allowed(struct task_struct *p, struct sched_cluster *cluster) +{ + cpumask_t tmp_mask; + + cpumask_and(&tmp_mask, &cluster->cpus, cpu_active_mask); + cpumask_and(&tmp_mask, &tmp_mask, &p->cpus_allowed); + + return !cpumask_empty(&tmp_mask); +} + + +/* return cheapest cpu that can fit this task */ +static int select_best_cpu(struct task_struct *p, int target, int reason, + int sync) +{ + struct sched_cluster *cluster, *pref_cluster = NULL; + struct cluster_cpu_stats stats; + bool fast_path = false; + struct related_thread_group *grp; + + struct cpu_select_env env = { + .p = p, + .reason = reason, + .need_idle = wake_to_idle(p), + .need_waker_cluster = 0, + .boost_type = sched_boost_type(), + .sync = sync, + .prev_cpu = target, + .ignore_prev_cpu = 0, + .rtg = NULL, + }; + + bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); + bitmap_zero(env.backup_list, NR_CPUS); + + init_cluster_cpu_stats(&stats); + + rcu_read_lock(); + + grp = task_related_thread_group(p); + + if (grp && grp->preferred_cluster) { + pref_cluster = grp->preferred_cluster; + if (!cluster_allowed(p, pref_cluster)) + clear_bit(pref_cluster->id, env.candidate_list); + else + env.rtg = grp; + } else { + cluster = cpu_rq(smp_processor_id())->cluster; + if (wake_to_waker_cluster(&env) && + cluster_allowed(p, cluster)) { + env.need_waker_cluster = 1; + bitmap_zero(env.candidate_list, NR_CPUS); + __set_bit(cluster->id, env.candidate_list); + } else if (bias_to_prev_cpu(&env, &stats)) { + fast_path = true; + goto out; + } + } + +retry: + cluster = select_least_power_cluster(&env); + + if (!cluster) 
+ goto out; + + /* + * 'cluster' now points to the minimum power cluster which can satisfy + * task's perf goals. Walk down the cluster list starting with that + * cluster. For non-small tasks, skip clusters that don't have + * mostly_idle/idle cpus + */ + + do { + find_best_cpu_in_cluster(cluster, &env, &stats); + + } while ((cluster = next_best_cluster(cluster, &env, &stats))); + + if (env.need_idle) { + if (stats.best_idle_cpu >= 0) + target = stats.best_idle_cpu; + else if (stats.least_loaded_cpu >= 0) + target = stats.least_loaded_cpu; + } else if (stats.best_cpu >= 0) { + if (stats.best_cpu != task_cpu(p) && + stats.min_cost == stats.best_sibling_cpu_cost) + stats.best_cpu = stats.best_sibling_cpu; + + target = stats.best_cpu; + } else { + if (env.rtg) { + env.rtg = NULL; + goto retry; + } + + find_backup_cluster(&env, &stats); + if (stats.best_capacity_cpu >= 0) + target = stats.best_capacity_cpu; + } + p->last_cpu_selected_ts = sched_ktime_clock(); + +out: + rcu_read_unlock(); + trace_sched_task_load(p, sched_boost(), env.reason, env.sync, + env.need_idle, fast_path, target); + return target; +} + +static void +inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks++; +} + +static void +dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks--; + + BUG_ON(stats->nr_big_tasks < 0); +} + +static void +inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) +{ + stats->nr_big_tasks = 0; + if (reset_cra) { + stats->cumulative_runnable_avg = 0; + set_pred_demands_sum(stats, 0); + } +} + + +#ifdef CONFIG_CFS_BANDWIDTH + +static inline struct task_group *next_task_group(struct task_group *tg) +{ + tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); + + return (&tg->list == &task_groups) ? 
NULL : tg; +} + +/* Iterate over all cfs_rq in a cpu */ +#define for_each_cfs_rq(cfs_rq, tg, cpu) \ + for (tg = container_of(&task_groups, struct task_group, list); \ + ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));) + +static void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) +{ + struct task_group *tg; + struct cfs_rq *cfs_rq; + + rcu_read_lock(); + + for_each_cfs_rq(cfs_rq, tg, cpu) + reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra); + + rcu_read_unlock(); +} + +#else /* CONFIG_CFS_BANDWIDTH */ + +static inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { } + +#endif /* CONFIG_CFS_BANDWIDTH */ + +/* + * Return total number of tasks "eligible" to run on highest capacity cpu + * + * This is simply nr_big_tasks for cpus which are not of max_capacity and + * nr_running for cpus of max_capacity + */ +unsigned int nr_eligible_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + int nr_big = rq->hmp_stats.nr_big_tasks; + int nr = rq->nr_running; + + if (cpu_max_possible_capacity(cpu) != max_possible_capacity) + return nr_big; + + return nr; +} + +/* + * reset_cpu_hmp_stats - reset HMP stats for a cpu + * nr_big_tasks + * cumulative_runnable_avg (iff reset_cra is true) + */ +void reset_cpu_hmp_stats(int cpu, int reset_cra) +{ + reset_cfs_rq_hmp_stats(cpu, reset_cra); + reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); +} + +static void +fixup_nr_big_tasks(struct hmp_sched_stats *stats, struct task_struct *p, + s64 delta) +{ + u64 new_task_load; + u64 old_task_load; + + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); + new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); + + if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load)) + stats->nr_big_tasks--; + else if (!__is_big_task(p, old_task_load) && + __is_big_task(p, new_task_load)) + stats->nr_big_tasks++; + + BUG_ON(stats->nr_big_tasks < 0); +} + + +#ifdef CONFIG_CFS_BANDWIDTH + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra); +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra); + +/* Add task's contribution to a cpu's HMP statistics */ +static void +_inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + /* + * Although the check below is not strictly required (as + * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called + * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on + * efficiency by short-circuiting the for_each_sched_entity() loop when + * !sched_enable_hmp || sched_disable_window_stats + */ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) + inc_rq_hmp_stats(rq, p, change_cra); +} + +/* Remove task's contribution from a cpu's HMP statistics */ +static void +_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + /* See comment on efficiency in _inc_hmp_sched_stats_fair */ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +
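/* walk up the sched_entity hierarchy, stopping at the first throttled cfs_rq */ +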
dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) + dec_rq_hmp_stats(rq, p, change_cra); +} + +static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + _inc_hmp_sched_stats_fair(rq, p, 1); +} + +static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + _dec_hmp_sched_stats_fair(rq, p, 1); +} + +static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p, + task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) { + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, + task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta); + } +} + +static int task_will_be_throttled(struct task_struct *p); + +#else /* CONFIG_CFS_BANDWIDTH */ + +static void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + inc_nr_big_task(&rq->hmp_stats, p); + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + dec_nr_big_task(&rq->hmp_stats, p); + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} +static void +fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta); +} + +static inline int task_will_be_throttled(struct task_struct *p) +{ + return 0; +} + +static void +_inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); +} + +#endif /* CONFIG_CFS_BANDWIDTH */ + +/* + * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. 
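+ * Called with the cpu's runqueue lock held; cumulative_runnable_avg + * is not reset here.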
+ */ +static void update_nr_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + /* Do not reset cumulative_runnable_avg */ + reset_cpu_hmp_stats(cpu, 0); + + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) + _inc_hmp_sched_stats_fair(rq, p, 0); +} + +/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ +void pre_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + local_irq_disable(); + + for_each_cpu(i, cpus) + raw_spin_lock(&cpu_rq(i)->lock); +} + +/* + * Reinitialize 'nr_big_tasks' counters on all affected cpus + */ +void post_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + /* Assumes local_irq_disable() keeps online cpumap stable */ + for_each_cpu(i, cpus) + update_nr_big_tasks(i); + + for_each_cpu(i, cpus) + raw_spin_unlock(&cpu_rq(i)->lock); + + local_irq_enable(); +} + +DEFINE_MUTEX(policy_mutex); + +#ifdef CONFIG_SCHED_FREQ_INPUT +static inline int invalid_value_freq_input(unsigned int *data) +{ + if (data == &sysctl_sched_freq_aggregate) + return !(*data == 0 || *data == 1); + + return 0; +} +#else +static inline int invalid_value_freq_input(unsigned int *data) +{ + return 0; +} +#endif + +static inline int invalid_value(unsigned int *data) +{ + unsigned int val = *data; + + if (data == &sysctl_sched_ravg_hist_size) + return (val < 2 || val > RAVG_HIST_SIZE_MAX); + + if (data == &sysctl_sched_window_stats_policy) + return val >= WINDOW_STATS_INVALID_POLICY; + + return invalid_value_freq_input(data); +} + +/* + * Handle "atomic" update of sysctl_sched_window_stats_policy, + * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables. + */ +int sched_window_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write || (write && (old_val == *data))) + goto done; + + if (invalid_value(data)) { + *data = old_val; + ret = -EINVAL; + goto done; + } + + reset_all_window_stats(0, 0); + +done: + mutex_unlock(&policy_mutex); + + return ret; +} + +/* + * Convert percentage value into absolute form. This will avoid div() operation + * in fast path, to convert task load in percentage scale. + */ +int sched_hmp_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int old_val; + unsigned int *data = (unsigned int *)table->data; + int update_min_nice = 0; + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write || !sched_enable_hmp) + goto done; + + if (write && (old_val == *data)) + goto done; + + if (data != &sysctl_sched_select_prev_cpu_us) { + /* + * all tunables other than sched_select_prev_cpu_us are + * in percentage. + */ + if (sysctl_sched_downmigrate_pct > + sysctl_sched_upmigrate_pct || *data > 100) { + *data = old_val; + ret = -EINVAL; + goto done; + } + } + + /* + * Big task tunable change will need to re-classify tasks on + * runqueue as big and set their counters appropriately. + * sysctl interface affects secondary variables (*_pct), which is then + * "atomically" carried over to the primary variables. 
Atomic change + * includes taking runqueue lock of all online cpus and re-initializing + * their big counter values based on changed criteria. + */ + if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { + get_online_cpus(); + pre_big_task_count_change(cpu_online_mask); + } + + set_hmp_defaults(); + + if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { + post_big_task_count_change(cpu_online_mask); + put_online_cpus(); + } + +done: + mutex_unlock(&policy_mutex); + return ret; +} + +/* + * Reset balance_interval at all sched_domain levels of given cpu, so that it + * honors kick. + */ +static inline void reset_balance_interval(int cpu) +{ + struct sched_domain *sd; + + if (cpu >= nr_cpu_ids) + return; + + rcu_read_lock(); + for_each_domain(cpu, sd) + sd->balance_interval = 0; + rcu_read_unlock(); +} + +/* + * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the ideal + * cpu as per its demand or priority) + * + * Returns the reason why the task needs to be migrated + */ +static inline int migration_needed(struct task_struct *p, int cpu) +{ + int nice; + struct related_thread_group *grp; + + if (!sched_enable_hmp || p->state != TASK_RUNNING) + return 0; + + /* No need to migrate task that is about to be throttled */ + if (task_will_be_throttled(p)) + return 0; + + if (sched_boost_type() == SCHED_BOOST_ON_BIG) { + if (cpu_capacity(cpu) != max_capacity) + return UP_MIGRATION; + return 0; + } + + if (sched_cpu_high_irqload(cpu)) + return IRQLOAD_MIGRATION; + + nice = task_nice(p); + rcu_read_lock(); + grp = task_related_thread_group(p); + if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) && cpu_capacity(cpu) > min_capacity) { + rcu_read_unlock(); + return DOWN_MIGRATION; + } + + if (!grp && !task_will_fit(p, cpu)) { + rcu_read_unlock(); + return UP_MIGRATION; + } + rcu_read_unlock(); + + return 0; +} + +static DEFINE_RAW_SPINLOCK(migration_lock); + +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + unsigned long flags; + int rc = 0; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +/* + * Check if currently running task should be migrated to a better cpu. + * + * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */ +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq), new_cpu; + int active_balance = 0, reason; + + reason = migration_needed(p, cpu); + if (!reason) + return; + + raw_spin_lock(&migration_lock); + new_cpu = select_best_cpu(p, cpu, reason, 0); + + if (new_cpu != cpu) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + mark_reserved(new_cpu); + } + + raw_spin_unlock(&migration_lock); + + if (active_balance) + stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq, + &rq->active_balance_work); +} + +static inline int nr_big_tasks(struct rq *rq) +{ + return rq->hmp_stats.nr_big_tasks; +} + +unsigned int cpu_temp(int cpu) +{ + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + if (per_cpu_info) + return per_cpu_info[cpu].temp; + else + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +struct cpu_select_env; +struct sched_cluster; + +static inline int task_will_fit(struct task_struct *p, int cpu, + enum sched_boost_type boost_type) +{ + return 1; +} + +static inline int select_best_cpu(struct task_struct *p, int target, + int reason, int sync) +{ + return 0; +} + +unsigned int power_cost(int cpu, u64 demand) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline int +spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +{ + return 0; +} + +static inline int sched_boost(void) +{ + return 0; +} + +static inline int is_big_task(struct task_struct *p) +{ + return 0; +} + +static inline int nr_big_tasks(struct rq *rq) +{ + return 0; +} + +static inline int is_cpu_throttling_imminent(int cpu) +{ + return 0; +} + +static inline int is_task_migration_throttled(struct task_struct *p) +{ + return 0; +} + +unsigned int cpu_temp(int cpu) +{ + return 0; +} + +static inline void +inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } +static inline void +dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline int +preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + return 1; +} + +static inline struct sched_cluster *rq_cluster(struct rq *rq) +{ + return NULL; +} + +#endif /* CONFIG_SCHED_HMP */ + + + #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 #error "load tracking assumes 2^10 as unit" #endif @@ -2564,6 +4322,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u32 contrib; unsigned int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq, scale_cpu; + struct sched_entity *se = NULL; delta = now - sa->last_update_time; /* @@ -2584,6 +4343,12 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return 0; sa->last_update_time = now; + if (sched_use_pelt && cfs_rq && weight) { + se = container_of(sa, struct sched_entity, avg); + if (entity_is_task(se) && se->on_rq) + dec_hmp_sched_stats_fair(rq_of(cfs_rq), task_of(se)); + } + scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); @@ -2604,6 +4369,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scaled_delta_w = cap_scale(delta_w, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta_w; + add_to_scaled_stat(cpu, sa, delta_w); if (cfs_rq) { cfs_rq->runnable_load_sum += weight * scaled_delta_w; @@ -2630,6 +4396,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, contrib 
= cap_scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; + add_to_scaled_stat(cpu, sa, contrib); if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } @@ -2641,9 +4408,14 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scaled_delta = cap_scale(delta, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta; + add_to_scaled_stat(cpu, sa, delta); if (cfs_rq) cfs_rq->runnable_load_sum += weight * scaled_delta; } + + if (se && entity_is_task(se) && se->on_rq) + inc_hmp_sched_stats_fair(rq_of(cfs_rq), task_of(se)); + if (running) sa->util_sum += scaled_delta * scale_cpu; @@ -2884,8 +4656,192 @@ static inline int idle_balance(struct rq *rq) return 0; } +static inline void +inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } +static inline void +dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_HMP + +#ifdef CONFIG_SCHED_FREQ_INPUT +#define clear_ravg_pred_demand() (p->ravg.pred_demand = 0) +#else +#define clear_ravg_pred_demand() +#endif + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_pelt = sched_init_task_load_pelt; + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + rcu_assign_pointer(p->grp, NULL); + INIT_LIST_HEAD(&p->grp_list); + memset(&p->ravg, 0, sizeof(struct ravg)); + p->cpu_cycles = 0; + + if (init_load_pct) { + init_load_pelt = div64_u64((u64)init_load_pct * + (u64)LOAD_AVG_MAX, 100); + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + } + + p->ravg.demand = init_load_windows; + clear_ravg_pred_demand(); + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; + p->se.avg.runnable_avg_sum_scaled = init_load_pelt; +} + +#else /* CONFIG_SCHED_HMP */ + +void init_new_task_load(struct task_struct *p) +{ +} + +#endif /* CONFIG_SCHED_HMP */ + +#ifdef CONFIG_SCHED_HMP + +/* Return task demand in percentage scale */ +unsigned int pct_task_load(struct task_struct *p) +{ + unsigned int load; + + load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); + + return load; +} + +/* + * Add scaled version of 'delta' to runnable_avg_sum_scaled + * 'delta' is scaled in reference to "best" cpu + */ +static inline void +add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta) +{ + int cur_freq = cpu_cur_freq(cpu); + u64 scaled_delta; + int sf; + + if (!sched_enable_hmp) + return; + + if (unlikely(cur_freq > max_possible_freq)) + cur_freq = max_possible_freq; + + scaled_delta = div64_u64(delta * cur_freq, max_possible_freq); + sf = (cpu_efficiency(cpu) * 1024) / max_possible_efficiency; + scaled_delta *= sf; + scaled_delta >>= 10; + sa->runnable_avg_sum_scaled += scaled_delta; +} + +static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods) +{ + if (!sched_enable_hmp) + return; + + sa->runnable_avg_sum_scaled = + decay_load(sa->runnable_avg_sum_scaled, + periods); +} + +#ifdef CONFIG_CFS_BANDWIDTH + +static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) +{ + cfs_rq->hmp_stats.nr_big_tasks = 0; + cfs_rq->hmp_stats.cumulative_runnable_avg = 0; + set_pred_demands_sum(&cfs_rq->hmp_stats, 0); +} + +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); +} + +static void dec_cfs_rq_hmp_stats(struct cfs_rq 
*cfs_rq, + struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); +} + +static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ + stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg += + cfs_rq->hmp_stats.cumulative_runnable_avg; + set_pred_demands_sum(stats, stats->pred_demands_sum + + cfs_rq->hmp_stats.pred_demands_sum); +} + +static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ + stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg -= + cfs_rq->hmp_stats.cumulative_runnable_avg; + set_pred_demands_sum(stats, stats->pred_demands_sum - + cfs_rq->hmp_stats.pred_demands_sum); + + BUG_ON(stats->nr_big_tasks < 0 || + (s64)stats->cumulative_runnable_avg < 0); + verify_pred_demands_sum(stats); +} + +#else /* CONFIG_CFS_BANDWIDTH */ + +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +#endif /* CONFIG_CFS_BANDWIDTH */ + +#else /* CONFIG_SCHED_HMP */ + +static inline void +add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta) +{ +} + +static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods) +{ +} + +static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { } + +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ +} + +static inline void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ +} + +#endif /* CONFIG_SCHED_HMP */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -2931,6 +4887,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } trace_sched_stat_blocked(tsk, delta); + trace_sched_blocked_reason(tsk); /* * Blocking time is in units of nanosecs, so shift by @@ -3498,6 +5455,33 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +/* + * Check if task is part of a hierarchy where some cfs_rq does not have any + * runtime left. 
+ * + * We can't rely on throttled_hierarchy() to do this test, as + * cfs_rq->throttle_count will not be updated yet when this function is called + * from scheduler_tick() + */ +static int task_will_be_throttled(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq; + + if (!cfs_bandwidth_used()) + return 0; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + if (!cfs_rq->runtime_enabled) + continue; + if (cfs_rq->runtime_remaining <= 0) + return 1; + } + + return 0; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { @@ -3577,13 +5561,20 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; +#ifdef CONFIG_SCHED_HMP + dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq); +#endif if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); +#ifdef CONFIG_SCHED_HMP + dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq); +#endif + } cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -3604,6 +5595,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); + + /* Log effect on hmp stats after throttling */ + trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)), + sched_irqload(cpu_of(rq)), + power_cost(cpu_of(rq), 0), + cpu_temp(cpu_of(rq))); } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -3613,6 +5610,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se; int enqueue = 1; long task_delta; + struct cfs_rq *tcfs_rq = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -3640,17 +5638,30 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; +#ifdef CONFIG_SCHED_HMP + inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq); +#endif if (cfs_rq_throttled(cfs_rq)) break; } - if (!se) + if (!se) { add_nr_running(rq, task_delta); +#ifdef CONFIG_SCHED_HMP + inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq); +#endif + } /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); + + /* Log effect on hmp stats after un-throttling */ + trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)), + sched_irqload(cpu_of(rq)), + power_cost(cpu_of(rq), 0), + cpu_temp(cpu_of(rq))); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, @@ -3971,6 +5982,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); + init_cfs_rq_hmp_stats(cfs_rq); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -4086,7 +6098,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -4102,8 +6114,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) /* * called from enqueue/dequeue and updates the hrtick when the - * current task is from our class and nr_running is low enough - * to matter. + * current task is from our class. 
*/ static void hrtick_update(struct rq *rq) { @@ -4112,8 +6123,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) return; - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, curr); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -4152,6 +6162,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + inc_cfs_rq_hmp_stats(cfs_rq, p, 1); flags = ENQUEUE_WAKEUP; } @@ -4159,6 +6170,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + inc_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; @@ -4167,9 +6179,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { add_nr_running(rq, 1); - + inc_rq_hmp_stats(rq, p, 1); + } hrtick_update(rq); } @@ -4199,6 +6212,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + dec_cfs_rq_hmp_stats(cfs_rq, p, 1); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4219,6 +6233,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + dec_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; @@ -4227,9 +6242,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { sub_nr_running(rq, 1); - + dec_rq_hmp_stats(rq, p, 1); + } hrtick_update(rq); } @@ -4849,6 +6865,11 @@ static int select_idle_sibling(struct task_struct *p, int target) if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) return i; + if (!sysctl_sched_wake_to_idle && + !(current->flags & PF_WAKE_UP_IDLE) && + !(p->flags & PF_WAKE_UP_IDLE)) + return target; + /* * Otherwise, iterate the domains and find an elegible idle cpu. 
*/ @@ -4931,6 +6952,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; + if (sched_enable_hmp) + return select_best_cpu(p, prev_cpu, 0, sync); + if (sd_flag & SD_BALANCE_WAKE) want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); @@ -5518,6 +7542,13 @@ enum fbq_type { regular, remote, all }; #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 +#define LBF_SCHED_BOOST_ACTIVE_BALANCE 0x40 +#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80 +#define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \ + LBF_BIG_TASK_ACTIVE_BALANCE) +#define LBF_IGNORE_BIG_TASKS 0x100 +#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 +#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400 struct lb_env { struct sched_domain *sd; @@ -5534,6 +7565,8 @@ struct lb_env { long imbalance; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; + unsigned int busiest_grp_capacity; + unsigned int busiest_nr_running; unsigned int flags; @@ -5545,6 +7578,9 @@ struct lb_env { struct list_head tasks; }; +static DEFINE_PER_CPU(bool, dbs_boost_needed); +static DEFINE_PER_CPU(int, dbs_boost_load_moved); + /* * Is this task likely cache-hot: */ @@ -5640,6 +7676,7 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; + int twf, group_cpus; lockdep_assert_held(&env->src_rq->lock); @@ -5686,6 +7723,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; + if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) && + nr_big_tasks(env->src_rq) && !is_big_task(p)) + return 0; + + twf = task_will_fit(p, env->dst_cpu); + + /* + * Attempt to not pull tasks that don't fit. We may get lucky and find + * one that actually fits. + */ + if (env->flags & LBF_IGNORE_BIG_TASKS && !twf) + return 0; + + if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS && + !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p)) + return 0; + + /* + * Group imbalance can sometimes cause work to be pulled across groups + * even though the group could have managed the imbalance on its own. + * Prevent inter-cluster migrations for big tasks when the number of + * tasks is lower than the capacity of the group. + */ + group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity, + SCHED_CAPACITY_SCALE); + if (!twf && env->busiest_nr_running <= group_cpus) + return 0; + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; @@ -5693,15 +7758,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. + * 1) IDLE or NEWLY_IDLE balance. + * 2) destination numa is preferred + * 3) task is cache cold, or + * 4) too many balance attempts have failed. 
*/ tsk_cache_hot = migrate_degrades_locality(p, env); if (tsk_cache_hot == -1) tsk_cache_hot = task_hot(p, env); - if (tsk_cache_hot <= 0 || + if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); @@ -5721,9 +7787,13 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + if (rcu_access_pointer(p->grp)) + env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK; + double_unlock_balance(env->src_rq, env->dst_rq); } /* @@ -5751,6 +7821,8 @@ static struct task_struct *detach_one_task(struct lb_env *env) * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); + per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p); + return p; } return NULL; @@ -5770,12 +7842,20 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; unsigned long load; int detached = 0; + int orig_loop = env->loop; lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) && + !sched_boost()) + env->flags |= LBF_IGNORE_BIG_TASKS; + else if (!same_cluster(env->dst_cpu, env->src_cpu)) + env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; + +redo: while (!list_empty(tasks)) { /* * We don't want to steal all, otherwise we may be treated likewise, @@ -5814,6 +7894,7 @@ static int detach_tasks(struct lb_env *env) detached++; env->imbalance -= load; + per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p); #ifdef CONFIG_PREEMPT /* @@ -5837,6 +7918,15 @@ next: list_move_tail(&p->se.group_node, tasks); } + if (env->flags & (LBF_IGNORE_BIG_TASKS | + LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) { + tasks = &env->src_rq->cfs_tasks; + env->flags &= ~(LBF_IGNORE_BIG_TASKS | + LBF_IGNORE_PREFERRED_CLUSTER_TASKS); + env->loop = orig_loop; + goto redo; + } + /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather @@ -5855,9 +7945,11 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); + if (task_notify_on_migrate(p)) + per_cpu(dbs_boost_needed, task_cpu(p)) = true; } /* @@ -6001,6 +8093,10 @@ struct sg_lb_stats { unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ +#ifdef CONFIG_SCHED_HMP + unsigned long sum_nr_big_tasks; + u64 group_cpu_load; /* Scaled load of all CPUs of the group */ +#endif unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; @@ -6047,6 +8143,56 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } +#ifdef CONFIG_SCHED_HMP + +static int +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) +{ + int local_cpu, busiest_cpu; + int local_capacity, busiest_capacity; + int local_pwr_cost, busiest_pwr_cost; + int nr_cpus; + + if (!sysctl_sched_restrict_cluster_spill || sched_boost()) + return 0; + + local_cpu = group_first_cpu(sds->local); + busiest_cpu = group_first_cpu(sds->busiest); + + local_capacity = 
cpu_max_possible_capacity(local_cpu); + busiest_capacity = cpu_max_possible_capacity(busiest_cpu); + + local_pwr_cost = cpu_max_power_cost(local_cpu); + busiest_pwr_cost = cpu_max_power_cost(busiest_cpu); + + if (local_capacity < busiest_capacity || + (local_capacity == busiest_capacity && + local_pwr_cost <= busiest_pwr_cost)) + return 0; + + if (local_capacity > busiest_capacity && + sds->busiest_stat.sum_nr_big_tasks) + return 0; + + nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest)); + if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) && + (sds->busiest_stat.sum_nr_running < + nr_cpus * sysctl_sched_spill_nr_run)) + return 1; + + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline int +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HMP */ + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. @@ -6276,7 +8422,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) static inline enum group_type group_classify(struct sched_group *group, - struct sg_lb_stats *sgs) + struct sg_lb_stats *sgs, struct lb_env *env) { if (sgs->group_no_capacity) return group_overloaded; @@ -6309,6 +8455,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); + trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i), + sched_irqload(i), + power_cost(i, 0), + cpu_temp(i)); + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -6322,6 +8473,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (rq->nr_running > 1) *overload = true; +#ifdef CONFIG_SCHED_HMP + sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks; + sgs->group_cpu_load += cpu_load(i); +#endif + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -6341,8 +8497,40 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + sgs->group_type = group_classify(group, sgs, env); +} + +#ifdef CONFIG_SCHED_HMP +static bool update_sd_pick_busiest_active_balance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + if (env->idle != CPU_NOT_IDLE && + cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) { + if (sched_boost() && !sds->busiest && sgs->sum_nr_running) { + env->flags |= LBF_SCHED_BOOST_ACTIVE_BALANCE; + return true; + } + + if (sgs->sum_nr_big_tasks > + sds->busiest_stat.sum_nr_big_tasks) { + env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE; + return true; + } + } + + return false; +} +#else +static bool update_sd_pick_busiest_active_balance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + return false; } +#endif /** * update_sd_pick_busiest - return 1 on busiest group @@ -6364,6 +8552,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs)) + return true; + if (sgs->group_type > busiest->group_type) return true; @@ -6475,12 +8666,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 
1)) { sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); + sgs->group_type = group_classify(sg, sgs, env); } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; + env->busiest_nr_running = sgs->sum_nr_running; + env->busiest_grp_capacity = sgs->group_capacity; } next_group: @@ -6732,6 +8925,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; + if (env->flags & LBF_HMP_ACTIVE_BALANCE) + goto force_balance; + + if (bail_inter_cluster_balance(env, &sds)) + goto out_balanced; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; @@ -6793,6 +8992,57 @@ out_balanced: return NULL; } +#ifdef CONFIG_SCHED_HMP +static struct rq *find_busiest_queue_hmp(struct lb_env *env, + struct sched_group *group) +{ + struct rq *busiest = NULL, *busiest_big = NULL; + u64 max_runnable_avg = 0, max_runnable_avg_big = 0; + int max_nr_big = 0, nr_big; + bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE); + int i; + + for_each_cpu(i, sched_group_cpus(group)) { + struct rq *rq = cpu_rq(i); + u64 cumulative_runnable_avg = + rq->hmp_stats.cumulative_runnable_avg; + + if (!cpumask_test_cpu(i, env->cpus)) + continue; + + + if (find_big) { + nr_big = nr_big_tasks(rq); + if (nr_big > max_nr_big || + (nr_big > 0 && nr_big == max_nr_big && + cumulative_runnable_avg > max_runnable_avg_big)) { + max_runnable_avg_big = cumulative_runnable_avg; + busiest_big = rq; + max_nr_big = nr_big; + continue; + } + } + + if (cumulative_runnable_avg > max_runnable_avg) { + max_runnable_avg = cumulative_runnable_avg; + busiest = rq; + } + } + + if (busiest_big) + return busiest_big; + + env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE; + return busiest; +} +#else +static inline struct rq *find_busiest_queue_hmp(struct lb_env *env, + struct sched_group *group) +{ + return NULL; +} +#endif + /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -6803,6 +9053,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; + if (sched_enable_hmp) + return find_busiest_queue_hmp(env, group); + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -6870,15 +9123,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but * so long as it is large enough. */ -#define MAX_PINNED_INTERVAL 512 +#define MAX_PINNED_INTERVAL 16 /* Working cpumask for load_balance and load_balance_newidle. 
*/ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +#define NEED_ACTIVE_BALANCE_THRESHOLD 10 + static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; + if (env->flags & LBF_HMP_ACTIVE_BALANCE) + return 1; + if (env->idle == CPU_NEWLY_IDLE) { /* @@ -6903,11 +9161,10 @@ static int need_active_balance(struct lb_env *env) return 1; } - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); + return unlikely(sd->nr_balance_failed > + sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD); } -static int active_load_balance_cpu_stop(void *data); - static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; @@ -6950,10 +9207,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { - int ld_moved, cur_ld_moved, active_balance = 0; + int ld_moved = 0, cur_ld_moved, active_balance = 0; struct sched_domain *sd_parent = sd->parent; - struct sched_group *group; - struct rq *busiest; + struct sched_group *group = NULL; + struct rq *busiest = NULL; unsigned long flags; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); @@ -6967,6 +9224,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), + .imbalance = 0, + .flags = 0, + .loop = 0, + .busiest_nr_running = 0, + .busiest_grp_capacity = 0, }; /* @@ -6978,6 +9240,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); + per_cpu(dbs_boost_load_moved, this_cpu) = 0; schedstat_inc(sd, lb_count[idle]); redo: @@ -7019,6 +9282,13 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + /* The world might have changed. Validate assumptions */ + if (busiest->nr_running <= 1) { + raw_spin_unlock_irqrestore(&busiest->lock, flags); + env.flags &= ~LBF_ALL_PINNED; + goto no_move; + } + /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations @@ -7106,15 +9376,19 @@ more_balance: } } +no_move: if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + if (!(env.flags & LBF_HMP_ACTIVE_BALANCE)) + schedstat_inc(sd, lb_failed[idle]); + /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. */ - if (idle != CPU_NEWLY_IDLE) + if (idle != CPU_NEWLY_IDLE && + !(env.flags & LBF_HMP_ACTIVE_BALANCE)) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -7148,17 +9422,45 @@ more_balance: stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); + *continue_balancing = 0; } /* * We've kicked active balancing, reset the failure * counter. 
*/ - sd->nr_balance_failed = sd->cache_nice_tries+1; + sd->nr_balance_failed = + sd->cache_nice_tries + + NEED_ACTIVE_BALANCE_THRESHOLD - 1; } - } else + } else { sd->nr_balance_failed = 0; + if (per_cpu(dbs_boost_needed, this_cpu)) { + struct migration_notify_data mnd; + + mnd.src_cpu = cpu_of(busiest); + mnd.dest_cpu = this_cpu; + mnd.load = per_cpu(dbs_boost_load_moved, this_cpu); + if (mnd.load > 100) + mnd.load = 100; + atomic_notifier_call_chain(&migration_notifier_head, + 0, (void *)&mnd); + per_cpu(dbs_boost_needed, this_cpu) = false; + per_cpu(dbs_boost_load_moved, this_cpu) = 0; + } + + /* Assumes one 'busiest' cpu that we pulled tasks from */ + if (!same_freq_domain(this_cpu, cpu_of(busiest))) { + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + + check_for_freq_change(this_rq, false, check_groups); + check_for_freq_change(busiest, false, check_groups); + } else { + check_for_freq_change(this_rq, true, false); + } + } if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; @@ -7206,6 +9508,11 @@ out_one_pinned: ld_moved = 0; out: + trace_sched_load_balance(this_cpu, idle, *continue_balancing, + group ? group->cpumask[0] : 0, + busiest ? busiest->nr_running : 0, + env.imbalance, env.flags, ld_moved, + sd->balance_interval); return ld_moved; } @@ -7301,9 +9608,12 @@ static int idle_balance(struct rq *this_rq) /* * Stop searching for tasks to pull if there are - * now runnable tasks on this rq. + * now runnable tasks on the balance rq or if + * continue_balancing has been unset (only possible + * due to active migration). */ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || this_rq->nr_running > 0 || + !continue_balancing) break; } rcu_read_unlock(); @@ -7350,11 +9660,28 @@ static int active_load_balance_cpu_stop(void *data) int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; + struct sched_domain *sd = NULL; struct task_struct *p = NULL; + struct task_struct *push_task; + int push_task_detached = 0; + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + .busiest_nr_running = 0, + .busiest_grp_capacity = 0, + .flags = 0, + .loop = 0, + }; + bool moved = false; raw_spin_lock_irq(&busiest_rq->lock); + per_cpu(dbs_boost_load_moved, target_cpu) = 0; + /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) @@ -7371,6 +9698,20 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); + push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; + if (push_task) { + if (task_on_rq_queued(push_task) && + push_task->state == TASK_RUNNING && + task_cpu(push_task) == busiest_cpu && + cpu_online(target_cpu)) { + detach_task(push_task, &env); + push_task_detached = 1; + moved = true; + } + goto out_unlock; + } + /* Search for an sd spanning us and the target CPU. 
*/ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -7380,33 +9721,63 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - + env.sd = sd; schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + moved = true; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; + push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; + + if (push_task) + busiest_rq->push_task = NULL; + raw_spin_unlock(&busiest_rq->lock); + if (push_task) { + if (push_task_detached) + attach_one_task(target_rq, push_task); + put_task_struct(push_task); + clear_reserved(target_cpu); + } + if (p) attach_one_task(target_rq, p); local_irq_enable(); + if (moved && !same_freq_domain(busiest_cpu, target_cpu)) { + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + check_for_freq_change(busiest_rq, false, check_groups); + check_for_freq_change(target_rq, false, check_groups); + } else if (moved) { + check_for_freq_change(target_rq, true, false); + } + + if (per_cpu(dbs_boost_needed, target_cpu)) { + struct migration_notify_data mnd; + + mnd.src_cpu = cpu_of(busiest_rq); + mnd.dest_cpu = target_cpu; + mnd.load = per_cpu(dbs_boost_load_moved, target_cpu); + if (mnd.load > 100) + mnd.load = 100; + atomic_notifier_call_chain(&migration_notifier_head, + 0, (void *)&mnd); + + per_cpu(dbs_boost_needed, target_cpu) = false; + per_cpu(dbs_boost_load_moved, target_cpu) = 0; + } return 0; } @@ -7428,9 +9799,50 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -static inline int find_new_ilb(void) +#ifdef CONFIG_SCHED_HMP +static inline int find_new_hmp_ilb(int type) +{ + int call_cpu = raw_smp_processor_id(); + struct sched_domain *sd; + int ilb; + + rcu_read_lock(); + + /* Pick an idle cpu "closest" to call_cpu */ + for_each_domain(call_cpu, sd) { + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + sched_domain_span(sd)) { + if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT || + (hmp_capable() && + cpu_max_possible_capacity(ilb) <= + cpu_max_possible_capacity(call_cpu)) || + cpu_max_power_cost(ilb) <= + cpu_max_power_cost(call_cpu))) { + rcu_read_unlock(); + reset_balance_interval(ilb); + return ilb; + } + } + } + + rcu_read_unlock(); + return nr_cpu_ids; +} +#else /* CONFIG_SCHED_HMP */ +static inline int find_new_hmp_ilb(int type) +{ + return 0; +} +#endif /* CONFIG_SCHED_HMP */ + +static inline int find_new_ilb(int type) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + int ilb; + + if (sched_enable_hmp) + return find_new_hmp_ilb(type); + + ilb = cpumask_first(nohz.idle_cpus_mask); if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; @@ -7443,13 +9855,13 @@ static inline int find_new_ilb(void) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). 
*/ -static void nohz_balancer_kick(void) +static void nohz_balancer_kick(int type) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(); + ilb_cpu = find_new_ilb(type); if (ilb_cpu >= nr_cpu_ids) return; @@ -7734,6 +10146,70 @@ end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } +#ifdef CONFIG_SCHED_HMP +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) +{ + struct sched_domain *sd; + int i; + + if (rq->nr_running < 2) + return 0; + + if (!sysctl_sched_restrict_cluster_spill || sched_boost()) + return 1; + + if (hmp_capable() && cpu_max_possible_capacity(cpu) == + max_possible_capacity) + return 1; + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(rq->sd); + if (!sd) { + rcu_read_unlock(); + return 0; + } + + for_each_cpu(i, sched_domain_span(sd)) { + if (cpu_load(i) < sched_spill_load && + cpu_rq(i)->nr_running < + sysctl_sched_spill_nr_run) { + /* Change the kick type to limit to CPUs that + * are of equal or lower capacity. + */ + *type = NOHZ_KICK_RESTRICT; + break; + } + } + rcu_read_unlock(); + return 1; +} +#else +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) +{ + return 0; +} +#endif + +static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type) +{ + unsigned long now = jiffies; + + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. + */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return 0; + + if (sched_enable_hmp) + return _nohz_kick_needed_hmp(rq, cpu, type); + + if (time_before(now, nohz.next_balance)) + return 0; + + return (rq->nr_running >= 2); +} + /* * Current heuristic for kicking the idle load balancer in the presence * of an idle cpu in the system. @@ -7745,12 +10221,14 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline bool nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq, int *type) { - unsigned long now = jiffies; +#ifndef CONFIG_SCHED_HMP struct sched_domain *sd; struct sched_group_capacity *sgc; - int nr_busy, cpu = rq->cpu; + int nr_busy; +#endif + int cpu = rq->cpu; bool kick = false; if (unlikely(rq->idle_balance)) @@ -7763,19 +10241,10 @@ static inline bool nohz_kick_needed(struct rq *rq) set_cpu_sd_state_busy(); nohz_balance_exit_idle(cpu); - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing. 
- */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return false; - - if (time_before(now, nohz.next_balance)) - return false; - - if (rq->nr_running >= 2) + if (_nohz_kick_needed(rq, cpu, type)) return true; +#ifndef CONFIG_SCHED_HMP rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { @@ -7807,6 +10276,7 @@ static inline bool nohz_kick_needed(struct rq *rq) unlock: rcu_read_unlock(); +#endif return kick; } #else @@ -7840,6 +10310,8 @@ static void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { + int type = NOHZ_KICK_ANY; + /* Don't need to rebalance while attached to NULL domain */ if (unlikely(on_null_domain(rq))) return; @@ -7847,8 +10319,8 @@ void trigger_load_balance(struct rq *rq) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); + if (nohz_kick_needed(rq, &type)) + nohz_balancer_kick(type); #endif } @@ -8308,6 +10780,11 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_fair, + .dec_hmp_sched_stats = dec_hmp_sched_stats_fair, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..acee1854c3d0 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,7 @@ SCHED_FEAT(NONTASK_CAPACITY, true) * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ -SCHED_FEAT(TTWU_QUEUE, true) +SCHED_FEAT(TTWU_QUEUE, false) #ifdef HAVE_RT_PUSH_IPI /* @@ -68,4 +68,3 @@ SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) - diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..2489140a7c51 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,6 +219,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fdf9b..36c6634236fb 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -79,6 +79,26 @@ static void update_curr_idle(struct rq *rq) { } +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p) +{ +} + +static void +dec_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p) +{ +} + +static void +fixup_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ +} + +#endif + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -107,4 +127,9 @@ const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_idle, + .dec_hmp_sched_stats = dec_hmp_sched_stats_idle, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_idle, +#endif }; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..cfec881491ef 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/irq_work.h> +#include <trace/events/sched.h> int sched_rr_timeslice = RR_TIMESLICE; @@ -889,6 +890,51 @@ static inline int 
rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *rt_se; + char buf[500]; + char *pos = buf; + char *end = buf + sizeof(buf); + int idx; + + pos += snprintf(pos, sizeof(buf), + "sched: RT throttling activated for rt_rq %p (cpu %d)\n", + rt_rq, cpu_of(rq_of_rt_rq(rt_rq))); + + if (bitmap_empty(array->bitmap, MAX_RT_PRIO)) + goto out; + + pos += snprintf(pos, end - pos, "potential CPU hogs:\n"); + idx = sched_find_first_bit(array->bitmap); + while (idx < MAX_RT_PRIO) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pos < end) + pos += snprintf(pos, end - pos, "\t%s (%d)\n", + p->comm, p->pid); + } + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + } +out: +#ifdef CONFIG_PANIC_ON_RT_THROTTLING + /* + * Use pr_err() in the BUG() case since printk_sched() will + * not get flushed and deadlock is not a concern. + */ + pr_err("%s", buf); + BUG(); +#else + printk_deferred("%s", buf); +#endif +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -912,8 +958,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_deferred_once("sched: RT throttling activated\n"); + + if (!once) { + once = true; + dump_throttled_rt_tasks(rt_rq); + } } else { /* * In case we did anyway, make it go away, @@ -1130,6 +1182,41 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) { @@ -1261,6 +1348,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1272,6 +1360,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se); + dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); } @@ -1314,11 +1403,28 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int +select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + int target; + + rcu_read_lock(); + target = find_lowest_rq(p); + if (target != -1) + cpu = target; + 
rcu_read_unlock(); + + return cpu; +} + +static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + if (sched_enable_hmp) + return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); + /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) goto out; @@ -1556,6 +1662,74 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); +#ifdef CONFIG_SCHED_HMP + +static int find_lowest_rq_hmp(struct task_struct *task) +{ + struct cpumask *lowest_mask = *this_cpu_ptr(&local_cpu_mask); + struct cpumask candidate_mask = CPU_MASK_NONE; + struct sched_cluster *cluster; + int best_cpu = -1; + int prev_cpu = task_cpu(task); + u64 cpu_load, min_load = ULLONG_MAX; + int i; + int restrict_cluster = sched_boost() ? 0 : + sysctl_sched_restrict_cluster_spill; + + /* Make sure the mask is initialized first */ + if (unlikely(!lowest_mask)) + return best_cpu; + + if (task->nr_cpus_allowed == 1) + return best_cpu; /* No other targets possible */ + + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return best_cpu; /* No targets found */ + + /* + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. Now we want to elect + * the best one based on our affinity and topology. + */ + + for_each_sched_cluster(cluster) { + cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); + + if (cpumask_empty(&candidate_mask)) + continue; + + for_each_cpu(i, &candidate_mask) { + if (sched_cpu_high_irqload(i)) + continue; + + cpu_load = cpu_rq(i)->hmp_stats.cumulative_runnable_avg; + if (!restrict_cluster) + cpu_load = scale_load_to_cpu(cpu_load, i); + + if (cpu_load < min_load || + (cpu_load == min_load && + (i == prev_cpu || (best_cpu != prev_cpu && + cpus_share_cache(prev_cpu, i))))) { + min_load = cpu_load; + best_cpu = i; + } + } + if (restrict_cluster && best_cpu != -1) + break; + } + + return best_cpu; +} + +#else /* CONFIG_SCHED_HMP */ + +static int find_lowest_rq_hmp(struct task_struct *task) +{ + return -1; +} + +#endif /* CONFIG_SCHED_HMP */ + static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; @@ -1563,6 +1737,9 @@ static int find_lowest_rq(struct task_struct *task) int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + if (sched_enable_hmp) + return find_lowest_rq_hmp(task); + /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) return -1; @@ -1780,7 +1957,9 @@ retry: } deactivate_task(rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(next_task, lowest_rq->cpu); + next_task->on_rq = TASK_ON_RQ_QUEUED; activate_task(lowest_rq, next_task, 0); ret = 1; @@ -2034,7 +2213,9 @@ static void pull_rt_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, this_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; activate_task(this_rq, p, 0); /* * We continue with the search, just in @@ -2116,6 +2297,7 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } + #endif /* CONFIG_SMP */ /* @@ -2290,6 +2472,11 @@ const struct sched_class rt_sched_class = { .switched_to = switched_to_rt, .update_curr = update_curr_rt, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_rt, + .dec_hmp_sched_stats = dec_hmp_sched_stats_rt, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_rt, +#endif }; #ifdef 
CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0517abd7dd73..b9566cf3ad37 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -27,6 +27,20 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); + +struct freq_max_load_entry { + /* The maximum load which has accounted governor's headroom. */ + u64 hdemand; +}; + +struct freq_max_load { + struct rcu_head rcu; + int length; + struct freq_max_load_entry freqs[0]; +}; + +extern DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); + extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP @@ -240,6 +254,11 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; + bool notify_on_migrate; +#ifdef CONFIG_SCHED_HMP + bool upmigrate_discouraged; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -343,6 +362,82 @@ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +#ifdef CONFIG_SCHED_HMP + +struct hmp_sched_stats { + int nr_big_tasks; + u64 cumulative_runnable_avg; +#ifdef CONFIG_SCHED_FREQ_INPUT + u64 pred_demands_sum; +#endif +}; + +struct sched_cluster { + struct list_head list; + struct cpumask cpus; + int id; + int max_power_cost; + int min_power_cost; + int max_possible_capacity; + int capacity; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + unsigned int exec_scale_factor; + /* + * max_freq = user maximum + * max_mitigated_freq = thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq; + unsigned int max_possible_freq; + bool freq_init_done; + int dstate, dstate_wakeup_latency, dstate_wakeup_energy; + unsigned int static_cluster_pwr_cost; +}; + +extern unsigned long all_cluster_ids[]; + +static inline int cluster_first_cpu(struct sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +struct related_thread_group { + int id; + raw_spinlock_t lock; + struct list_head tasks; + struct list_head list; + struct sched_cluster *preferred_cluster; + struct rcu_head rcu; + u64 last_update; +#ifdef CONFIG_SCHED_FREQ_INPUT + struct group_cpu_time __percpu *cpu_time; /* one per cluster */ +#endif +}; + +struct migration_sum_data { + struct rq *src_rq, *dst_rq; +#ifdef CONFIG_SCHED_FREQ_INPUT + struct group_cpu_time *src_cpu_time, *dst_cpu_time; +#endif +}; + +extern struct list_head cluster_head; +extern int num_clusters; +extern struct sched_cluster *sched_cluster[NR_CPUS]; +extern int group_will_fit(struct sched_cluster *cluster, + struct related_thread_group *grp, u64 demand); + +struct cpu_cycle { + u64 cycles; + u64 time; +}; + +#define for_each_sched_cluster(cluster) \ + list_for_each_entry_rcu(cluster, &cluster_head, list) + +#endif + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -411,6 +506,11 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_CFS_BANDWIDTH + +#ifdef CONFIG_SCHED_HMP + struct hmp_sched_stats hmp_stats; +#endif + int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; @@ -624,6 +724,7 @@ struct rq { /* For active balancing */ int active_balance; int push_cpu; + struct task_struct *push_task; struct cpu_stop_work active_balance_work; /* cpu of this runqueue: */ int cpu; @@ -635,11 +736,41 @@ struct rq { u64 age_stamp; u64 idle_stamp; u64 avg_idle; 
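The sched.h hunks above introduce struct sched_cluster together with the RCU-protected cluster_head list, the for_each_sched_cluster() iterator and cluster_first_cpu(). As a rough illustration of how that data structure can be walked (the helper below is invented for this example and is not part of the patch), a caller could pick the first CPU of the cheapest cluster like this:

	/*
	 * Illustrative sketch only: iterate the cluster list declared above and
	 * return the first CPU of the cluster with the lowest max_power_cost.
	 * for_each_sched_cluster() expands to list_for_each_entry_rcu(), so the
	 * walk is wrapped in an RCU read-side critical section.
	 */
	static int cheapest_cluster_cpu(void)
	{
		struct sched_cluster *cluster;
		int best_cost = INT_MAX;
		int best_cpu = -1;

		rcu_read_lock();
		for_each_sched_cluster(cluster) {
			if (cluster->max_power_cost < best_cost) {
				best_cost = cluster->max_power_cost;
				best_cpu = cluster_first_cpu(cluster);
			}
		}
		rcu_read_unlock();

		return best_cpu;
	}
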
+ int cstate, wakeup_latency, wakeup_energy; /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_HMP + struct sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct hmp_sched_stats hmp_stats; + + u64 window_start; + unsigned long hmp_flags; + + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + unsigned int static_cpu_pwr_cost; + struct task_struct *ed_task; + struct cpu_cycle cc; + +#ifdef CONFIG_SCHED_FREQ_INPUT + u64 old_busy_time, old_busy_time_group; + int notifier_sent; + u64 old_estimated_time; +#endif +#endif + +#ifdef CONFIG_SCHED_FREQ_INPUT + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -905,6 +1036,446 @@ static inline void sched_ttwu_pending(void) { } #include "stats.h" #include "auto_group.h" +extern void init_new_task_load(struct task_struct *p); + +#ifdef CONFIG_SCHED_HMP + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +extern struct mutex policy_mutex; +extern unsigned int sched_ravg_window; +extern unsigned int sched_use_pelt; +extern unsigned int sched_disable_window_stats; +extern unsigned int sched_enable_hmp; +extern unsigned int max_possible_freq; +extern unsigned int min_max_freq; +extern unsigned int pct_task_load(struct task_struct *p); +extern unsigned int max_possible_efficiency; +extern unsigned int min_possible_efficiency; +extern unsigned int max_capacity; +extern unsigned int min_capacity; +extern unsigned int max_load_scale_factor; +extern unsigned int max_possible_capacity; +extern unsigned int min_max_possible_capacity; +extern unsigned int sched_upmigrate; +extern unsigned int sched_downmigrate; +extern unsigned int sched_init_task_load_pelt; +extern unsigned int sched_init_task_load_windows; +extern unsigned int up_down_migrate_scale_factor; +extern unsigned int sysctl_sched_restrict_cluster_spill; +extern unsigned int sched_pred_alert_load; + +#ifdef CONFIG_SCHED_FREQ_INPUT +#define MAJOR_TASK_PCT 85 +extern unsigned int sched_major_task_runtime; +#endif + +extern void reset_cpu_hmp_stats(int cpu, int reset_cra); +extern unsigned int max_task_load(void); +extern void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock); +extern void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock); + +unsigned int cpu_temp(int cpu); +int sched_set_group_id(struct task_struct *p, unsigned int group_id); +extern unsigned int nr_eligible_big_tasks(int cpu); +extern void update_up_down_migrate(void); + +static inline struct sched_cluster *cpu_cluster(int cpu) +{ + return cpu_rq(cpu)->cluster; +} + +static inline int cpu_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->capacity; +} + +static inline int cpu_max_possible_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_capacity; +} + +static inline int cpu_load_scale_factor(int cpu) +{ + return cpu_rq(cpu)->cluster->load_scale_factor; +} + +static inline int cpu_efficiency(int cpu) +{ + return cpu_rq(cpu)->cluster->efficiency; +} + +static inline unsigned int cpu_cur_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->cur_freq; +} + +static inline unsigned int cpu_min_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->min_freq; +} + +static inline unsigned int cluster_max_freq(struct sched_cluster *cluster) +{ + /* + * Governor and 
thermal driver don't know the other party's mitigation + * voting. So struct cluster saves both and return min() for current + * cluster fmax. + */ + return min(cluster->max_mitigated_freq, cluster->max_freq); +} + +static inline unsigned int cpu_max_freq(int cpu) +{ + return cluster_max_freq(cpu_rq(cpu)->cluster); +} + +static inline unsigned int cpu_max_possible_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_freq; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) +{ + return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster; +} + +static inline int cpu_max_power_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->max_power_cost; +} + +static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period) +{ + return div64_u64(cycles, period); +} + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} + +/* + * 'load' is in reference to "best cpu" at its best frequency. + * Scale that in reference to a given cpu, accounting for how bad it is + * in reference to "best cpu". + */ +static inline u64 scale_load_to_cpu(u64 task_load, int cpu) +{ + u64 lsf = cpu_load_scale_factor(cpu); + + if (lsf != 1024) { + task_load *= lsf; + task_load /= 1024; + } + + return task_load; +} + +static inline unsigned int task_load(struct task_struct *p) +{ + if (sched_use_pelt) + return p->se.avg.runnable_avg_sum_scaled; + + return p->ravg.demand; +} + +#ifdef CONFIG_SCHED_FREQ_INPUT +#define set_pred_demands_sum(stats, x) ((stats)->pred_demands_sum = (x)) +#define verify_pred_demands_sum(stat) BUG_ON((s64)(stat)->pred_demands_sum < 0) +#else +#define set_pred_demands_sum(stats, x) +#define verify_pred_demands_sum(stat) +#endif + +static inline void +inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ + u32 task_load; + + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + task_load = sched_use_pelt ? p->se.avg.runnable_avg_sum_scaled : + (sched_disable_window_stats ? 0 : p->ravg.demand); + + stats->cumulative_runnable_avg += task_load; + set_pred_demands_sum(stats, stats->pred_demands_sum + + p->ravg.pred_demand); +} + +static inline void +dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ + u32 task_load; + + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + task_load = sched_use_pelt ? p->se.avg.runnable_avg_sum_scaled : + (sched_disable_window_stats ? 
0 : p->ravg.demand); + + stats->cumulative_runnable_avg -= task_load; + + BUG_ON((s64)stats->cumulative_runnable_avg < 0); + + set_pred_demands_sum(stats, stats->pred_demands_sum - + p->ravg.pred_demand); + verify_pred_demands_sum(stats); +} + +static inline void +fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p, s64 task_load_delta, + s64 pred_demand_delta) +{ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + stats->cumulative_runnable_avg += task_load_delta; + BUG_ON((s64)stats->cumulative_runnable_avg < 0); + + set_pred_demands_sum(stats, stats->pred_demands_sum + + pred_demand_delta); + verify_pred_demands_sum(stats); +} + + +#define pct_to_real(tunable) \ + (div64_u64((u64)tunable * (u64)max_task_load(), 100)) + +#define real_to_pct(tunable) \ + (div64_u64((u64)tunable * (u64)100, (u64)max_task_load())) + +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + s64 delta; + + delta = get_jiffies_64() - rq->irqload_ts; + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. + */ + + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return rcu_dereference(p->grp); +} + +#else /* CONFIG_SCHED_HMP */ + +#define sched_use_pelt 0 + +struct hmp_sched_stats; +struct related_thread_group; + +static inline u64 scale_load_to_cpu(u64 load, int cpu) +{ + return load; +} + +static inline unsigned int nr_eligible_big_tasks(int cpu) +{ + return 0; +} + +static inline int pct_task_load(struct task_struct *p) { return 0; } + +static inline int cpu_capacity(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } + +static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ +} + +static inline void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock) +{ +} + +static inline int sched_cpu_high_irqload(int cpu) { return 0; } + +static inline void set_preferred_cluster(struct related_thread_group *grp) { } + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return NULL; +} + +static inline u32 task_load(struct task_struct *p) { return 0; } + +static inline int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HMP */ + +/* + * Returns the rq capacity of any rq in a group. This does not play + * well with groups where rq capacity can change independently. 
+ */ +#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group)) + +#ifdef CONFIG_SCHED_FREQ_INPUT +#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand) + +extern void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups); + +struct group_cpu_time { + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 window_start; +}; + +/* Is frequency of two cpus synchronized with each other? */ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct rq *rq = cpu_rq(src_cpu); + + if (src_cpu == dst_cpu) + return 1; + + return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); +} + +#else /* CONFIG_SCHED_FREQ_INPUT */ + +#define sched_migration_fixup 0 +#define PRED_DEMAND_DELTA (0) + +static inline void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} + +#endif /* CONFIG_SCHED_FREQ_INPUT */ + +#ifdef CONFIG_SCHED_HMP + +#define BOOST_KICK 0 +#define CPU_RESERVED 1 + +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + /* Name boost_flags as hmp_flags? */ + return test_and_set_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline u64 cpu_cravg_sync(int cpu, int sync) +{ + struct rq *rq = cpu_rq(cpu); + u64 load; + + load = rq->hmp_stats.cumulative_runnable_avg; + + /* + * If load is being checked in a sync wakeup environment, + * we may want to discount the load of the currently running + * task. + */ + if (sync && cpu == smp_processor_id()) { + if (load > rq->curr->ravg.demand) + load -= rq->curr->ravg.demand; + else + load = 0; + } + + return load; +} + +extern void check_for_migration(struct rq *rq, struct task_struct *p); +extern void pre_big_task_count_change(const struct cpumask *cpus); +extern void post_big_task_count_change(const struct cpumask *cpus); +extern void set_hmp_defaults(void); +extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost); +extern unsigned int power_cost(int cpu, u64 demand); +extern void reset_all_window_stats(u64 window_start, unsigned int window_size); +extern void boost_kick(int cpu); +extern int sched_boost(void); + +#else /* CONFIG_SCHED_HMP */ + +#define sched_enable_hmp 0 +#define sched_freq_legacy_mode 1 + +static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } +static inline void pre_big_task_count_change(void) { } +static inline void post_big_task_count_change(void) { } +static inline void set_hmp_defaults(void) { } + +static inline void clear_reserved(int cpu) { } + +#define trace_sched_cpu_load(...) +#define trace_sched_cpu_load_lb(...) +#define trace_sched_cpu_load_cgroup(...) +#define trace_sched_cpu_load_wakeup(...) 
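The CONFIG_SCHED_HMP block above also declares small helpers such as scale_load_to_cpu() and cpu_cravg_sync() that placement code could combine when comparing CPUs. A minimal sketch of such a comparison follows; the function itself is hypothetical and only uses helpers declared earlier in this header:

	/*
	 * Illustrative sketch only: pick the less loaded of two candidate CPUs
	 * for a wake-up. cpu_cravg_sync() optionally discounts the currently
	 * running task's demand for sync wake-ups, and scale_load_to_cpu()
	 * normalises the load using the candidate CPU's load_scale_factor.
	 */
	static inline int lighter_cpu(int cpu_a, int cpu_b, int sync)
	{
		u64 load_a = scale_load_to_cpu(cpu_cravg_sync(cpu_a, sync), cpu_a);
		u64 load_b = scale_load_to_cpu(cpu_cravg_sync(cpu_b, sync), cpu_b);

		return load_a <= load_b ? cpu_a : cpu_b;
	}
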
+ +#endif /* CONFIG_SCHED_HMP */ + #ifdef CONFIG_CGROUP_SCHED /* @@ -925,6 +1496,11 @@ static inline struct task_group *task_group(struct task_struct *p) return p->sched_task_group; } +static inline bool task_notify_on_migrate(struct task_struct *p) +{ + return task_group(p)->notify_on_migrate; +} + /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { @@ -950,7 +1526,10 @@ static inline struct task_group *task_group(struct task_struct *p) { return NULL; } - +static inline bool task_notify_on_migrate(struct task_struct *p) +{ + return false; +} #endif /* CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) @@ -1100,6 +1679,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ +#define WF_NO_NOTIFIER 0x08 /* do not notify governor */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1228,6 +1808,12 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_move_group) (struct task_struct *p); #endif +#ifdef CONFIG_SCHED_HMP + void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p); + void (*dec_hmp_sched_stats)(struct rq *rq, struct task_struct *p); + void (*fixup_hmp_sched_stats)(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand); +#endif }; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -1288,7 +1874,9 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) } #endif +#ifdef CONFIG_SYSRQ_SCHED_DEBUG extern void sysrq_sched_debug_show(void); +#endif extern void sched_init_granularity(void); extern void update_max_interval(void); @@ -1314,6 +1902,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; + sched_update_nr_prod(cpu_of(rq), count, true); rq->nr_running = prev_nr + count; if (prev_nr < 2 && rq->nr_running >= 2) { @@ -1340,6 +1929,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) static inline void sub_nr_running(struct rq *rq, unsigned count) { + sched_update_nr_prod(cpu_of(rq), count, false); rq->nr_running -= count; } @@ -1719,6 +2309,9 @@ enum rq_nohz_flag_bits { NOHZ_BALANCE_KICK, }; +#define NOHZ_KICK_ANY 0 +#define NOHZ_KICK_RESTRICT 1 + #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) #endif diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c new file mode 100644 index 000000000000..c70e0466c36c --- /dev/null +++ b/kernel/sched/sched_avg.c @@ -0,0 +1,128 @@ +/* Copyright (c) 2012, 2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +/* + * Scheduler hook for average runqueue determination + */ +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/hrtimer.h> +#include <linux/sched.h> +#include <linux/math64.h> + +#include "sched.h" +#include <trace/events/sched.h> + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); + +static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. + * + * Obtains the average nr_running value since the last poll. + * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 diff = curr_time - last_get_time; + u64 tmp_avg = 0, tmp_iowait = 0, tmp_big_avg = 0; + + *avg = 0; + *iowait_avg = 0; + *big_avg = 0; + + if (!diff) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + tmp_avg += per_cpu(nr_prod_sum, cpu); + tmp_avg += per_cpu(nr, cpu) * + (curr_time - per_cpu(last_time, cpu)); + + tmp_big_avg += per_cpu(nr_big_prod_sum, cpu); + tmp_big_avg += nr_eligible_big_tasks(cpu) * + (curr_time - per_cpu(last_time, cpu)); + + tmp_iowait += per_cpu(iowait_prod_sum, cpu); + tmp_iowait += nr_iowait_cpu(cpu) * + (curr_time - per_cpu(last_time, cpu)); + + per_cpu(last_time, cpu) = curr_time; + + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(iowait_prod_sum, cpu) = 0; + + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + diff = curr_time - last_get_time; + last_get_time = curr_time; + + *avg = (int)div64_u64(tmp_avg * 100, diff); + *big_avg = (int)div64_u64(tmp_big_avg * 100, diff); + *iowait_avg = (int)div64_u64(tmp_iowait * 100, diff); + + trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg); + + BUG_ON(*avg < 0 || *big_avg < 0 || *iowait_avg < 0); + pr_debug("%s - avg:%d big_avg:%d iowait_avg:%d\n", + __func__, *avg, *big_avg, *iowait_avg); +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. + * @delta: Adjust nr by 'delta' amount + * @inc: Whether we are increasing or decreasing the count + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, long delta, bool inc) +{ + int diff; + s64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = nr_running + (inc ? 
delta : -delta); + + BUG_ON((s64)per_cpu(nr, cpu) < 0); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff; + per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} +EXPORT_SYMBOL(sched_update_nr_prod); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..134da1cc8fce 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -17,6 +17,41 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static void check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) { @@ -42,12 +77,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + inc_hmp_sched_stats_stop(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + dec_hmp_sched_stats_stop(rq, p); } static void yield_task_stop(struct rq *rq) @@ -134,4 +171,9 @@ const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_stop, + .dec_hmp_sched_stats = dec_hmp_sched_stats_stop, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_stop, +#endif }; diff --git a/kernel/smp.c b/kernel/smp.c index d903c02223af..abdc48cd79a3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -32,6 +32,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static void flush_smp_call_function_queue(bool warn_cpu_offline); +/* CPU mask indicating which CPUs to bring online during smp_init() */ +static bool have_boot_cpu_mask; +static cpumask_var_t boot_cpu_mask; static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -548,6 +551,19 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); +static int __init boot_cpus(char *str) +{ + alloc_bootmem_cpumask_var(&boot_cpu_mask); + if (cpulist_parse(str, boot_cpu_mask) < 0) { + pr_warn("SMP: Incorrect boot_cpus cpumask\n"); + return -EINVAL; + } + have_boot_cpu_mask = true; + return 0; +} + +early_param("boot_cpus", boot_cpus); + /* Setup number of possible processor ids */ int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); @@ -563,6 +579,21 @@ void __weak smp_announce(void) printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); } +/* Should the given CPU be booted during smp_init() ? 
*/ +static inline bool boot_cpu(int cpu) +{ + if (!have_boot_cpu_mask) + return true; + + return cpumask_test_cpu(cpu, boot_cpu_mask); +} + +static inline void free_boot_cpu_mask(void) +{ + if (have_boot_cpu_mask) /* Allocated from boot_cpus() */ + free_bootmem_cpumask_var(boot_cpu_mask); +} + /* Called by boot processor to activate the rest. */ void __init smp_init(void) { @@ -574,10 +605,12 @@ void __init smp_init(void) for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) break; - if (!cpu_online(cpu)) + if (!cpu_online(cpu) && boot_cpu(cpu)) cpu_up(cpu); } + free_boot_cpu_mask(); + /* Any cleanup work */ smp_announce(); smp_cpus_done(setup_max_cpus); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59bff56..6949476a118f 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -13,6 +13,7 @@ #include <linux/percpu.h> #include <linux/kthread.h> #include <linux/smpboot.h> +#include <linux/kmemleak.h> #include "smpboot.h" @@ -177,6 +178,8 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); if (!td) return -ENOMEM; + + kmemleak_not_leak(td); td->cpu = cpu; td->ht = ht; diff --git a/kernel/sys.c b/kernel/sys.c index 78947de6f969..b5a8e844a968 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -41,6 +41,8 @@ #include <linux/syscore_ops.h> #include <linux/version.h> #include <linux/ctype.h> +#include <linux/mm.h> +#include <linux/mempolicy.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -2072,10 +2074,158 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) } #endif +#ifdef CONFIG_MMU +static int prctl_update_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + const char __user *name_addr) +{ + struct mm_struct *mm = vma->vm_mm; + int error = 0; + pgoff_t pgoff; + + if (name_addr == vma_get_anon_name(vma)) { + *prev = vma; + goto out; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, name_addr); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto out; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto out; + } + +success: + if (!vma->vm_file) + vma->anon_name = name_addr; + +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +static int prctl_set_vma_anon_name(unsigned long start, unsigned long end, + unsigned long arg) +{ + unsigned long tmp; + struct vm_area_struct *vma, *prev; + int unmapped_error = 0; + int error = -EINVAL; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - this matches the handling in madvise. + */ + vma = find_vma_prev(current->mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + return error; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + return error; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). 
*/ + error = prctl_update_vma_anon_name(vma, &prev, start, tmp, + (const char __user *)arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + error = unmapped_error; + if (start >= end) + return error; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); + } +} + +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long len_in, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + int error; + unsigned long len; + unsigned long end; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + down_write(&mm->mmap_sem); + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + error = prctl_set_vma_anon_name(start, end, arg); + break; + default: + error = -EINVAL; + } + + up_write(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_MMU */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long len_in, unsigned long arg) +{ + return -EINVAL; +} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; + struct task_struct *tsk; unsigned char comm[sizeof(me->comm)]; long error; @@ -2218,6 +2368,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_TID_ADDRESS: error = prctl_get_tid_address(me, (int __user **)arg2); break; + case PR_SET_TIMERSLACK_PID: + if (task_pid_vnr(current) != (pid_t)arg3 && + !capable(CAP_SYS_NICE)) + return -EPERM; + rcu_read_lock(); + tsk = find_task_by_vpid((pid_t)arg3); + if (tsk == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + get_task_struct(tsk); + rcu_read_unlock(); + if (arg2 <= 0) + tsk->timer_slack_ns = + tsk->default_timer_slack_ns; + else + tsk->timer_slack_ns = arg2; + put_task_struct(tsk); + error = 0; + break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; break; @@ -2266,6 +2436,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..81fbed978da3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -104,6 +104,7 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif extern int pid_max; +extern int extra_free_kbytes; extern int pid_max_min, pid_max_max; extern int percpu_pagelist_fraction; extern int compat_log; @@ -284,6 +285,167 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_wake_to_idle", + .data = &sysctl_sched_wake_to_idle, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_wakeup_load_threshold", + .data = &sysctl_sched_wakeup_load_threshold, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHED_FREQ_INPUT + { + .procname = "sched_freq_inc_notify", + .data = &sysctl_sched_freq_inc_notify, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + 
.procname = "sched_freq_dec_notify", + .data = &sysctl_sched_freq_dec_notify, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, +#endif +#ifdef CONFIG_SCHED_HMP + { + .procname = "sched_cpu_high_irqload", + .data = &sysctl_sched_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_ravg_hist_size", + .data = &sysctl_sched_ravg_hist_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_window_update_handler, + }, + { + .procname = "sched_window_stats_policy", + .data = &sysctl_sched_window_stats_policy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_window_update_handler, + }, + { + .procname = "sched_spill_load", + .data = &sysctl_sched_spill_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + }, + { + .procname = "sched_spill_nr_run", + .data = &sysctl_sched_spill_nr_run, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "sched_upmigrate", + .data = &sysctl_sched_upmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + }, + { + .procname = "sched_downmigrate", + .data = &sysctl_sched_downmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + }, + { + .procname = "sched_init_task_load", + .data = &sysctl_sched_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + }, + { + .procname = "sched_select_prev_cpu_us", + .data = &sysctl_sched_select_prev_cpu_us, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + }, + { + .procname = "sched_enable_colocation", + .data = &sysctl_sched_enable_colocation, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "sched_restrict_cluster_spill", + .data = &sysctl_sched_restrict_cluster_spill, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "sched_small_wakee_task_load", + .data = &sysctl_sched_small_wakee_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + }, + { + .procname = "sched_big_waker_task_load", + .data = &sysctl_sched_big_waker_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + }, +#ifdef CONFIG_SCHED_FREQ_INPUT + { + .procname = "sched_new_task_windows", + .data = &sysctl_sched_new_task_windows, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_window_update_handler, + }, + { + .procname = "sched_pred_alert_freq", + .data = &sysctl_sched_pred_alert_freq, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "sched_freq_aggregate", + .data = &sysctl_sched_freq_aggregate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_window_update_handler, + }, +#endif + { + .procname = "sched_boost", + .data = &sysctl_sched_boost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_boost_handler, + }, +#endif /* CONFIG_SCHED_HMP */ #ifdef 
CONFIG_SCHED_DEBUG { .procname = "sched_min_granularity_ns", @@ -1172,6 +1334,27 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) + { + .procname = "boot_reason", + .data = &boot_reason, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + + { + .procname = "cold_boot", + .data = &cold_boot, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { } }; @@ -1393,6 +1576,14 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "extra_free_kbytes", + .data = &extra_free_kbytes, + .maxlen = sizeof(extra_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = &zero, + }, + { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = sizeof(percpu_pagelist_fraction), @@ -1568,6 +1759,44 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif +#ifdef CONFIG_SWAP + { + .procname = "swap_ratio", + .data = &sysctl_swap_ratio, + .maxlen = sizeof(sysctl_swap_ratio), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + }, + { + .procname = "swap_ratio_enable", + .data = &sysctl_swap_ratio_enable, + .maxlen = sizeof(sysctl_swap_ratio_enable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + }, +#endif { } }; @@ -2015,15 +2244,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - if (*negp) { - if (*lvalp > (unsigned long) INT_MAX + 1) - return -EINVAL; - *valp = -*lvalp; - } else { - if (*lvalp > (unsigned long) INT_MAX) - return -EINVAL; - *valp = *lvalp; - } + *valp = *negp ? 
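/*
 * Illustrative sketch (not part of this patch): the table entries added
 * above surface as ordinary proc files, e.g. /proc/sys/vm/extra_free_kbytes
 * (0644) and /proc/sys/vm/mmap_rnd_bits (0600, clamped to the per-arch
 * min/max). The 16384 below is an arbitrary example value.
 */
#include <stdio.h>

static void raise_extra_free_kbytes(void)
{
	FILE *f = fopen("/proc/sys/vm/extra_free_kbytes", "w");

	if (!f)
		return;
	fprintf(f, "%d\n", 16384);	/* keep roughly 16 MiB extra free */
	fclose(f);
}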
-*lvalp : *lvalp; } else { int val = *valp; if (val < 0) { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 10a1d7dc9313..4a816bab38a2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -138,6 +138,8 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, + { CTL_INT, KERN_BOOT_REASON, "boot_reason" }, + { CTL_INT, KERN_COLD_BOOT, "cold_boot" }, {} }; @@ -523,6 +525,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = { { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_PREFIX_ROUTE, "accept_ra_prefix_route" }, {} }; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 49eca0beed32..5819ca07a22b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -12,3 +12,5 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o + +ccflags-y += -Idrivers/cpuidle diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 7fbba635a549..0cdc34ebd8d1 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,11 @@ #include <linux/workqueue.h> #include <linux/freezer.h> +#ifdef CONFIG_MSM_PM +#include "lpm-levels.h" +#endif +#include <linux/workqueue.h> + /** * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base @@ -46,14 +51,130 @@ static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); static struct wakeup_source *ws; +static struct delayed_work work; +static struct workqueue_struct *power_off_alarm_workqueue; #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); +static struct mutex power_on_alarm_lock; +static struct alarm init_alarm; /** + * power_on_alarm_init - Init power on alarm value + * + * Read rtc alarm value after device booting up and add this alarm + * into alarm queue. + */ +void power_on_alarm_init(void) +{ + struct rtc_wkalrm rtc_alarm; + struct rtc_time rt; + unsigned long alarm_time; + struct rtc_device *rtc; + ktime_t alarm_ktime; + + rtc = alarmtimer_get_rtcdev(); + + if (!rtc) + return; + + rtc_read_alarm(rtc, &rtc_alarm); + rt = rtc_alarm.time; + + rtc_tm_to_time(&rt, &alarm_time); + + if (alarm_time) { + alarm_ktime = ktime_set(alarm_time, 0); + alarm_init(&init_alarm, ALARM_POWEROFF_REALTIME, NULL); + alarm_start(&init_alarm, alarm_ktime); + } +} + +/** + * set_power_on_alarm - set power on alarm value into rtc register + * + * Get the soonest power off alarm timer and set the alarm value into rtc + * register. 
+ */ +void set_power_on_alarm(void) +{ + int rc; + struct timespec wall_time, alarm_ts; + long alarm_secs = 0l; + long rtc_secs, alarm_time, alarm_delta; + struct rtc_time rtc_time; + struct rtc_wkalrm alarm; + struct rtc_device *rtc; + struct timerqueue_node *next; + unsigned long flags; + struct alarm_base *base = &alarm_bases[ALARM_POWEROFF_REALTIME]; + + rc = mutex_lock_interruptible(&power_on_alarm_lock); + if (rc != 0) + return; + + spin_lock_irqsave(&base->lock, flags); + next = timerqueue_getnext(&base->timerqueue); + spin_unlock_irqrestore(&base->lock, flags); + + if (next) { + alarm_ts = ktime_to_timespec(next->expires); + alarm_secs = alarm_ts.tv_sec; + } + + if (!alarm_secs) + goto disable_alarm; + + getnstimeofday(&wall_time); + + /* + * alarm_secs have to be bigger than "wall_time +1". + * It is to make sure that alarm time will be always + * bigger than wall time. + */ + if (alarm_secs <= wall_time.tv_sec + 1) + goto disable_alarm; + + rtc = alarmtimer_get_rtcdev(); + if (!rtc) + goto exit; + + rtc_read_time(rtc, &rtc_time); + rtc_tm_to_time(&rtc_time, &rtc_secs); + alarm_delta = wall_time.tv_sec - rtc_secs; + alarm_time = alarm_secs - alarm_delta; + + rtc_time_to_tm(alarm_time, &alarm.time); + alarm.enabled = 1; + rc = rtc_set_alarm(rtcdev, &alarm); + if (rc) + goto disable_alarm; + + mutex_unlock(&power_on_alarm_lock); + return; + +disable_alarm: + rtc_alarm_irq_enable(rtcdev, 0); +exit: + mutex_unlock(&power_on_alarm_lock); +} + +static void alarmtimer_triggered_func(void *p) +{ + struct rtc_device *rtc = rtcdev; + + if (!(rtc->irq_data & RTC_AF)) + return; + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); +} + +static struct rtc_task alarmtimer_rtc_task = { + .func = alarmtimer_triggered_func +}; +/** * alarmtimer_get_rtcdev - Return selected rtcdevice * * This function returns the rtc device to use for wakealarms. 
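/*
 * Illustrative userspace sketch (not part of this patch): arming a wake-up
 * on the ALARM_POWEROFF_REALTIME base introduced here. CLOCK_POWEROFF_ALARM
 * is registered as a posix clock later in this patch; the constant itself
 * comes from the uapi header change outside this kernel/ diffstat. Once such
 * an alarm is queued, set_power_on_alarm() above copies the soonest expiry
 * into the RTC alarm register. Compile with -lrt.
 */
#include <time.h>
#include <signal.h>

static void arm_poweroff_alarm(time_t seconds_from_now)
{
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	struct itimerspec its = { { 0, 0 }, { 0, 0 } };
	timer_t tid;

	clock_gettime(CLOCK_REALTIME, &its.it_value);
	its.it_value.tv_sec += seconds_from_now;

	if (timer_create(CLOCK_POWEROFF_ALARM, &sev, &tid) == 0)
		timer_settime(tid, TIMER_ABSTIME, &its, NULL);
}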
@@ -63,7 +184,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); struct rtc_device *alarmtimer_get_rtcdev(void) { unsigned long flags; - struct rtc_device *ret; + struct rtc_device *ret = NULL; spin_lock_irqsave(&rtcdev_lock, flags); ret = rtcdev; @@ -77,33 +198,48 @@ static int alarmtimer_rtc_add_device(struct device *dev, struct class_interface *class_intf) { unsigned long flags; + int err = 0; struct rtc_device *rtc = to_rtc_device(dev); - if (rtcdev) return -EBUSY; - if (!rtc->ops->set_alarm) return -1; - if (!device_may_wakeup(rtc->dev.parent)) - return -1; spin_lock_irqsave(&rtcdev_lock, flags); if (!rtcdev) { + err = rtc_irq_register(rtc, &alarmtimer_rtc_task); + if (err) + goto rtc_irq_reg_err; rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); } + +rtc_irq_reg_err: spin_unlock_irqrestore(&rtcdev_lock, flags); - return 0; + return err; + +} + +static void alarmtimer_rtc_remove_device(struct device *dev, + struct class_interface *class_intf) +{ + if (rtcdev && dev == &rtcdev->dev) { + rtc_irq_unregister(rtcdev, &alarmtimer_rtc_task); + rtcdev = NULL; + } } static inline void alarmtimer_rtc_timer_init(void) { + mutex_init(&power_on_alarm_lock); + rtc_timer_init(&rtctimer, NULL, NULL); } static struct class_interface alarmtimer_rtc_interface = { .add_dev = &alarmtimer_rtc_add_device, + .remove_dev = &alarmtimer_rtc_remove_device, }; static int alarmtimer_rtc_interface_setup(void) @@ -124,8 +260,14 @@ struct rtc_device *alarmtimer_get_rtcdev(void) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } static inline void alarmtimer_rtc_timer_init(void) { } +void set_power_on_alarm(void) { } #endif +static void alarm_work_func(struct work_struct *unused) +{ + set_power_on_alarm(); +} + /** * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue * @base: pointer to the base where the timer is being run @@ -195,6 +337,10 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); + /* set next power off alarm */ + if (alarm->type == ALARM_POWEROFF_REALTIME) + queue_delayed_work(power_off_alarm_workqueue, &work, 0); + return ret; } @@ -217,6 +363,70 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); * set an rtc timer to fire that far into the future, which * will wake us from suspend. 
*/ +#if defined(CONFIG_RTC_DRV_QPNP) && defined(CONFIG_MSM_PM) +static int alarmtimer_suspend(struct device *dev) +{ + struct rtc_time tm; + ktime_t min, now; + unsigned long flags; + struct rtc_device *rtc; + int i; + int ret = 0; + + spin_lock_irqsave(&freezer_delta_lock, flags); + min = freezer_delta; + freezer_delta = ktime_set(0, 0); + spin_unlock_irqrestore(&freezer_delta_lock, flags); + + rtc = alarmtimer_get_rtcdev(); + /* If we have no rtcdev, just return */ + if (!rtc) + return 0; + + /* Find the soonest timer to expire*/ + for (i = 0; i < ALARM_NUMTYPE; i++) { + struct alarm_base *base = &alarm_bases[i]; + struct timerqueue_node *next; + ktime_t delta; + + spin_lock_irqsave(&base->lock, flags); + next = timerqueue_getnext(&base->timerqueue); + spin_unlock_irqrestore(&base->lock, flags); + if (!next) + continue; + delta = ktime_sub(next->expires, base->gettime()); + if (!min.tv64 || (delta.tv64 < min.tv64)) + min = delta; + } + if (min.tv64 == 0) + return 0; + + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } + + /* Setup a timer to fire that far in the future */ + rtc_timer_cancel(rtc, &rtctimer); + rtc_read_time(rtc, &tm); + now = rtc_tm_to_ktime(tm); + now = ktime_add(now, min); + if (poweron_alarm) { + struct rtc_time tm_val; + unsigned long secs; + + tm_val = rtc_ktime_to_tm(min); + rtc_tm_to_time(&tm_val, &secs); + lpm_suspend_wake_time(secs); + } else { + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + } + return ret; +} +#else static int alarmtimer_suspend(struct device *dev) { struct rtc_time tm; @@ -226,6 +436,8 @@ static int alarmtimer_suspend(struct device *dev) int i; int ret; + cancel_delayed_work_sync(&work); + spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; freezer_delta = ktime_set(0, 0); @@ -271,11 +483,31 @@ static int alarmtimer_suspend(struct device *dev) __pm_wakeup_event(ws, MSEC_PER_SEC); return ret; } +#endif +static int alarmtimer_resume(struct device *dev) +{ + struct rtc_device *rtc; + + rtc = alarmtimer_get_rtcdev(); + /* If we have no rtcdev, just return */ + if (!rtc) + return 0; + rtc_timer_cancel(rtc, &rtctimer); + + queue_delayed_work(power_off_alarm_workqueue, &work, 0); + return 0; +} + #else static int alarmtimer_suspend(struct device *dev) { return 0; } + +static int alarmtimer_resume(struct device *dev) +{ + return 0; +} #endif static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) @@ -443,12 +675,14 @@ EXPORT_SYMBOL_GPL(alarm_forward_now); * clock2alarm - helper that converts from clockid to alarmtypes * @clockid: clockid. 
*/ -static enum alarmtimer_type clock2alarm(clockid_t clockid) +enum alarmtimer_type clock2alarm(clockid_t clockid) { if (clockid == CLOCK_REALTIME_ALARM) return ALARM_REALTIME; if (clockid == CLOCK_BOOTTIME_ALARM) return ALARM_BOOTTIME; + if (clockid == CLOCK_POWEROFF_ALARM) + return ALARM_POWEROFF_REALTIME; return -1; } @@ -800,6 +1034,7 @@ out: /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { .suspend = alarmtimer_suspend, + .resume = alarmtimer_resume, }; static struct platform_driver alarmtimer_driver = { @@ -834,10 +1069,13 @@ static int __init alarmtimer_init(void) posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + posix_timers_register_clock(CLOCK_POWEROFF_ALARM, &alarm_clock); /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_POWEROFF_REALTIME].base_clockid = CLOCK_REALTIME; + alarm_bases[ALARM_POWEROFF_REALTIME].gettime = &ktime_get_real; alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; for (i = 0; i < ALARM_NUMTYPE; i++) { @@ -859,8 +1097,24 @@ static int __init alarmtimer_init(void) goto out_drv; } ws = wakeup_source_register("alarmtimer"); - return 0; + if (!ws) { + error = -ENOMEM; + goto out_ws; + } + + INIT_DELAYED_WORK(&work, alarm_work_func); + power_off_alarm_workqueue = + create_singlethread_workqueue("power_off_alarm"); + if (!power_off_alarm_workqueue) { + error = -ENOMEM; + goto out_wq; + } + return 0; +out_wq: + wakeup_source_unregister(ws); +out_ws: + platform_device_unregister(pdev); out_drv: platform_driver_unregister(&alarmtimer_driver); out_if: diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index a26036d37a38..0637823aa5a6 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -70,6 +70,7 @@ struct clock_data { static struct hrtimer sched_clock_timer; static int irqtime = -1; +static int initialized; core_param(irqtime, irqtime, int, 0400); @@ -231,6 +232,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } +int sched_clock_initialized(void) +{ + return initialized; +} + void __init sched_clock_postinit(void) { /* @@ -249,6 +255,8 @@ void __init sched_clock_postinit(void) hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sched_clock_timer.function = sched_clock_poll; hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + + initialized = 1; } /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 22c57e191a23..651ff1a3a306 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -24,6 +24,7 @@ #include <linux/posix-timers.h> #include <linux/perf_event.h> #include <linux/context_tracking.h> +#include <linux/rq_stats.h> #include <asm/irq_regs.h> @@ -31,6 +32,10 @@ #include <trace/events/timer.h> +struct rq_data rq_info; +struct workqueue_struct *rq_wq; +spinlock_t rq_lock; + /* * Per cpu nohz control structure */ @@ -41,6 +46,21 @@ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); */ static ktime_t last_jiffies_update; +u64 jiffy_to_ktime_ns(u64 *now, u64 *jiffy_ktime_ns) +{ + u64 cur_jiffies; + unsigned long seq; + + do { + seq = read_seqbegin(&jiffies_lock); + *now = ktime_get_ns(); + *jiffy_ktime_ns = ktime_to_ns(last_jiffies_update); + cur_jiffies = get_jiffies_64(); + } while 
(read_seqretry(&jiffies_lock, seq)); + + return cur_jiffies; +} + struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); @@ -143,7 +163,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; } @@ -430,7 +450,7 @@ static void tick_nohz_update_jiffies(ktime_t now) tick_do_update_jiffies64(now); local_irq_restore(flags); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } /* @@ -701,7 +721,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) update_cpu_load_nohz(); calc_load_exit_idle(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ @@ -1049,6 +1069,51 @@ void tick_irq_enter(void) * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS +static void update_rq_stats(void) +{ + unsigned long jiffy_gap = 0; + unsigned int rq_avg = 0; + unsigned long flags = 0; + + jiffy_gap = jiffies - rq_info.rq_poll_last_jiffy; + + if (jiffy_gap >= rq_info.rq_poll_jiffies) { + + spin_lock_irqsave(&rq_lock, flags); + + if (!rq_info.rq_avg) + rq_info.rq_poll_total_jiffies = 0; + + rq_avg = nr_running() * 10; + + if (rq_info.rq_poll_total_jiffies) { + rq_avg = (rq_avg * jiffy_gap) + + (rq_info.rq_avg * + rq_info.rq_poll_total_jiffies); + do_div(rq_avg, + rq_info.rq_poll_total_jiffies + jiffy_gap); + } + + rq_info.rq_avg = rq_avg; + rq_info.rq_poll_total_jiffies += jiffy_gap; + rq_info.rq_poll_last_jiffy = jiffies; + + spin_unlock_irqrestore(&rq_lock, flags); + } +} + +static void wakeup_user(void) +{ + unsigned long jiffy_gap; + + jiffy_gap = jiffies - rq_info.def_timer_last_jiffy; + + if (jiffy_gap >= rq_info.def_timer_jiffies) { + rq_info.def_timer_last_jiffy = jiffies; + queue_work(rq_wq, &rq_info.def_timer_work); + } +} + /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. 
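/*
 * Illustrative standalone sketch (not kernel code) of the statistic that
 * update_rq_stats() above maintains: a time-weighted average of
 * nr_running() * 10, where each sample is weighted by the jiffies elapsed
 * since the previous poll. All numbers are made-up example values.
 */
#include <stdio.h>

struct rq_avg_state {
	unsigned long total_jiffies;
	unsigned int avg;		/* run-queue depth, scaled by 10 */
};

static void rq_avg_update(struct rq_avg_state *s,
			  unsigned int sample_x10, unsigned long jiffy_gap)
{
	if (s->total_jiffies)
		s->avg = (sample_x10 * jiffy_gap + s->avg * s->total_jiffies) /
			 (s->total_jiffies + jiffy_gap);
	else
		s->avg = sample_x10;
	s->total_jiffies += jiffy_gap;
}

int main(void)
{
	struct rq_avg_state s = { 0, 0 };

	rq_avg_update(&s, 20, 10);	/* 2 runnable tasks for 10 jiffies */
	rq_avg_update(&s, 40, 5);	/* 4 runnable tasks for 5 jiffies */
	printf("avg x10 = %u\n", s.avg);	/* (40*5 + 20*10) / 15 = 26 */
	return 0;
}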
@@ -1066,9 +1131,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * Do not call, when we are not in irq context and have * no valid regs pointer */ - if (regs) + if (regs) { tick_sched_handle(ts, regs); + if (rq_info.init == 1 && + tick_do_timer_cpu == smp_processor_id()) { + /* + * update run queue statistics + */ + update_rq_stats(); + + /* + * wakeup user if needed + */ + wakeup_user(); + } + } + /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; @@ -1181,3 +1260,8 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } + +ktime_t * get_next_event_cpu(unsigned int cpu) +{ + return &(per_cpu(tick_cpu_device, cpu).evtdev->next_event); +} diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..51896272fcde 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -94,12 +94,15 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; +static inline void __run_timers(struct tvec_base *base); static DEFINE_PER_CPU(struct tvec_base, tvec_bases); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; +struct tvec_base tvec_base_deferrable; + void timers_update_migration(bool update_nohz) { bool on = sysctl_timer_migration && tick_nohz_active; @@ -135,18 +138,62 @@ int timer_migration_handler(struct ctl_table *table, int write, } static inline struct tvec_base *get_target_base(struct tvec_base *base, - int pinned) + int pinned, u32 timer_flags) { + if (!pinned && !(timer_flags & TIMER_PINNED_ON_CPU) && + (timer_flags & TIMER_DEFERRABLE)) + return &tvec_base_deferrable; if (pinned || !base->migration_enabled) return this_cpu_ptr(&tvec_bases); return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); } + +static inline void __run_deferrable_timers(void) +{ + if (smp_processor_id() == tick_do_timer_cpu && + time_after_eq(jiffies, tvec_base_deferrable.timer_jiffies)) + __run_timers(&tvec_base_deferrable); +} + +static inline void init_timer_deferrable_global(void) +{ + tvec_base_deferrable.cpu = nr_cpu_ids; + spin_lock_init(&tvec_base_deferrable.lock); + tvec_base_deferrable.timer_jiffies = jiffies; + tvec_base_deferrable.next_timer = tvec_base_deferrable.timer_jiffies; +} + +static inline struct tvec_base *get_timer_base(u32 timer_flags) +{ + if (!(timer_flags & TIMER_PINNED_ON_CPU) && + timer_flags & TIMER_DEFERRABLE) + return &tvec_base_deferrable; + else + return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK); +} #else static inline struct tvec_base *get_target_base(struct tvec_base *base, - int pinned) + int pinned, u32 timer_flags) { return this_cpu_ptr(&tvec_bases); } + +static inline void __run_deferrable_timers(void) +{ +} + +static inline void init_timer_deferrable_global(void) +{ + /* + * initialize cpu unbound deferrable timer base only when CONFIG_SMP. + * UP kernel handles the timers with cpu 0 timer base. 
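/*
 * Illustrative sketch (not part of this patch) of a timer that this change
 * routes to the global tvec_base_deferrable: it is deferrable and not
 * pinned, so get_target_base() above picks the unbound base and it is
 * serviced only on tick_do_timer_cpu via __run_deferrable_timers(). Uses the
 * 4.4-era timer API; the callback name and 10 second period are made-up
 * example values.
 */
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list housekeeping_timer;

static void housekeeping_fn(unsigned long data)
{
	/* work that can tolerate being deferred while CPUs are idle */
	mod_timer(&housekeeping_timer, jiffies + 10 * HZ);
}

static void housekeeping_start(void)
{
	init_timer_deferrable(&housekeeping_timer);	/* sets TIMER_DEFERRABLE */
	housekeeping_timer.function = housekeeping_fn;
	mod_timer(&housekeeping_timer, jiffies + 10 * HZ);	/* not pinned */
}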
+ */ +} + +static inline struct tvec_base *get_timer_base(u32 timer_flags) +{ + return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK); +} #endif static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -768,7 +815,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, struct tvec_base *base; if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); + base = get_timer_base(tf); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -797,7 +844,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned); + new_base = get_target_base(base, pinned, timer->flags); if (base != new_base) { /* @@ -819,6 +866,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } } + if (pinned == TIMER_PINNED) + timer->flags |= TIMER_PINNED_ON_CPU; + else + timer->flags &= ~TIMER_PINNED_ON_CPU; timer->expires = expires; internal_add_timer(base, timer); @@ -1000,6 +1051,7 @@ void add_timer_on(struct timer_list *timer, int cpu) (timer->flags & ~TIMER_BASEMASK) | cpu); } + timer->flags |= TIMER_PINNED_ON_CPU; debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); @@ -1433,6 +1485,8 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = this_cpu_ptr(&tvec_bases); + __run_deferrable_timers(); + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1656,6 +1710,8 @@ static void __init init_timer_cpus(void) for_each_possible_cpu(cpu) init_timer_cpu(cpu); + + init_timer_deferrable_global(); } void __init init_timers(void) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e45db6b0d878..048bf074bef9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,6 +77,9 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool +config GPU_TRACEPOINTS + bool + config CONTEXT_SWITCH_TRACER bool @@ -86,6 +89,31 @@ config RING_BUFFER_ALLOW_SWAP Allow the use of ring_buffer_swap_cpu. Adds a very slight overhead to tracing when enabled. +config IPC_LOGGING + bool "Debug Logging for IPC Drivers" + select GENERIC_TRACER + help + This option allows the debug logging for IPC Drivers. + + If in doubt, say no. + +config QCOM_RTB + bool "Register tracing" + help + Add support for logging different events to a small uncached + region. This is designed to aid in debugging reset cases where the + caches may not be flushed before the target resets. + +config QCOM_RTB_SEPARATE_CPUS + bool "Separate entries for each cpu" + depends on QCOM_RTB + depends on SMP + help + Under some circumstances, it may be beneficial to give dedicated space + for each cpu to log accesses. Selecting this option will log each cpu + separately. This will guarantee that the last acesses for each cpu + will be logged but there will be fewer entries per cpu + # All tracer options should select GENERIC_TRACER. For those options that are # enabled by all tracers (context switch and event tracer) they select TRACING. # This allows those options to appear when no other tracer is selected. But the @@ -485,6 +513,19 @@ config FUNCTION_PROFILER If in doubt, say N. +config CPU_FREQ_SWITCH_PROFILER + bool "CPU frequency switch time profiler" + select GENERIC_TRACER + help + This option enables the CPU frequency switch profiler. A file is + created in debugfs called "cpu_freq_switch_profile_enabled", which + defaults to zero. 
When a 1 is echoed into this file, profiling begins. + When a zero is echoed, profiling stops. A "cpu_freq_switch" file is + also created in the trace_stats directory; this file shows the + switches that have occurred and duration statistics. + + If in doubt, say N. + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 9b1044e936a6..2acad4b6a92a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_CPU_FREQ_SWITCH_PROFILER) += trace_cpu_freq_switch.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o @@ -64,7 +65,13 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o +obj-$(CONFIG_QCOM_RTB) += msm_rtb.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +obj-$(CONFIG_IPC_LOGGING) += ipc_logging.o +ifdef CONFIG_DEBUG_FS +obj-$(CONFIG_IPC_LOGGING) += ipc_logging_debug.o +endif libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a990824c8604..7b6127653a37 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -199,9 +199,9 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) + int rw, u32 what, int error, int pdu_len, + void *pdu_data, struct task_struct *tsk) { - struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct ring_buffer *buffer = NULL; struct blk_io_trace *t; @@ -708,18 +708,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, unsigned int nr_bytes, u32 what) { struct blk_trace *bt = q->blk_trace; + struct task_struct *tsk = current; if (likely(!bt)) return; + /* + * Use the bio context for all events except ISSUE and + * COMPLETE events. + * + * Not all the pages in the bio are dirtied by the same task but + * most likely it will be, since the sectors accessed on the device + * must be adjacent. + */ + if (!((what == BLK_TA_ISSUE) || (what == BLK_TA_COMPLETE)) && + bio_has_data(rq->bio) && rq->bio->bi_io_vec && + rq->bio->bi_io_vec->bv_page && + rq->bio->bi_io_vec->bv_page->tsk_dirty) + tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty; + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, - what, rq->errors, rq->cmd_len, rq->cmd); + what, rq->errors, rq->cmd_len, rq->cmd, tsk); } else { what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, - rq->cmd_flags, what, rq->errors, 0, NULL); + rq->cmd_flags, what, rq->errors, 0, NULL, tsk); } } @@ -771,12 +786,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { struct blk_trace *bt = q->blk_trace; + struct task_struct *tsk = current; if (likely(!bt)) return; + /* + * Not all the pages in the bio are dirtied by the same task but + * most likely it will be, since the sectors accessed on the device + * must be adjacent. 
+ */ + if (bio_has_data(bio) && bio->bi_io_vec && bio->bi_io_vec->bv_page && + bio->bi_io_vec->bv_page->tsk_dirty) + tsk = bio->bi_io_vec->bv_page->tsk_dirty; + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, what, error, 0, NULL); + bio->bi_rw, what, error, 0, NULL, tsk); } static void blk_add_trace_bio_bounce(void *ignore, @@ -824,7 +849,8 @@ static void blk_add_trace_getrq(void *ignore, struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, + NULL, current); } } @@ -840,7 +866,7 @@ static void blk_add_trace_sleeprq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, current); } } @@ -849,7 +875,8 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, + current); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -866,7 +893,8 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, + current); } } @@ -875,13 +903,19 @@ static void blk_add_trace_split(void *ignore, unsigned int pdu) { struct blk_trace *bt = q->blk_trace; + struct task_struct *tsk = current; if (bt) { __be64 rpdu = cpu_to_be64(pdu); + if (bio_has_data(bio) && bio->bi_io_vec && + bio->bi_io_vec->bv_page && + bio->bi_io_vec->bv_page->tsk_dirty) + tsk = bio->bi_io_vec->bv_page->tsk_dirty; + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - bio->bi_error, sizeof(rpdu), &rpdu); + bio->bi_error, sizeof(rpdu), &rpdu, tsk); } } @@ -904,6 +938,7 @@ static void blk_add_trace_bio_remap(void *ignore, { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; + struct task_struct *tsk = current; if (likely(!bt)) return; @@ -912,9 +947,14 @@ static void blk_add_trace_bio_remap(void *ignore, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); + if (bio_has_data(bio) && bio->bi_io_vec && + bio->bi_io_vec->bv_page && + bio->bi_io_vec->bv_page->tsk_dirty) + tsk = bio->bi_io_vec->bv_page->tsk_dirty; + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_REMAP, bio->bi_error, - sizeof(r), &r); + sizeof(r), &r, tsk); } /** @@ -937,6 +977,7 @@ static void blk_add_trace_rq_remap(void *ignore, { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; + struct task_struct *tsk = current; if (likely(!bt)) return; @@ -945,9 +986,14 @@ static void blk_add_trace_rq_remap(void *ignore, r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); r.sector_from = cpu_to_be64(from); + if (bio_has_data(rq->bio) && rq->bio->bi_io_vec && + rq->bio->bi_io_vec->bv_page && + rq->bio->bi_io_vec->bv_page->tsk_dirty) + tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty; + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, - sizeof(r), &r); + sizeof(r), &r, tsk); } /** @@ -966,16 +1012,22 @@ void blk_add_driver_data(struct request_queue *q, void *data, size_t len) { struct blk_trace *bt = q->blk_trace; + struct task_struct *tsk = current; if (likely(!bt)) return; + if (bio_has_data(rq->bio) && rq->bio->bi_io_vec && + rq->bio->bi_io_vec->bv_page && + 
rq->bio->bi_io_vec->bv_page->tsk_dirty) + tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty; + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + BLK_TA_DRV_DATA, rq->errors, len, data, tsk); else __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + BLK_TA_DRV_DATA, rq->errors, len, data, tsk); } EXPORT_SYMBOL_GPL(blk_add_driver_data); diff --git a/kernel/trace/gpu-traces.c b/kernel/trace/gpu-traces.c new file mode 100644 index 000000000000..a4b3f00faee3 --- /dev/null +++ b/kernel/trace/gpu-traces.c @@ -0,0 +1,23 @@ +/* + * GPU tracepoints + * + * Copyright (C) 2013 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/gpu.h> + +EXPORT_TRACEPOINT_SYMBOL(gpu_sched_switch); +EXPORT_TRACEPOINT_SYMBOL(gpu_job_enqueue); diff --git a/kernel/trace/ipc_logging.c b/kernel/trace/ipc_logging.c new file mode 100644 index 000000000000..2c3e0998d400 --- /dev/null +++ b/kernel/trace/ipc_logging.c @@ -0,0 +1,876 @@ +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <asm/arch_timer.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/debugfs.h> +#include <linux/io.h> +#include <linux/idr.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/delay.h> +#include <linux/completion.h> +#include <linux/ipc_logging.h> + +#include "ipc_logging_private.h" + +#define LOG_PAGE_DATA_SIZE sizeof(((struct ipc_log_page *)0)->data) +#define LOG_PAGE_FLAG (1 << 31) + +static LIST_HEAD(ipc_log_context_list); +static DEFINE_RWLOCK(context_list_lock_lha1); +static void *get_deserialization_func(struct ipc_log_context *ilctxt, + int type); + +static struct ipc_log_page *get_first_page(struct ipc_log_context *ilctxt) +{ + struct ipc_log_page_header *p_pghdr; + struct ipc_log_page *pg = NULL; + + if (!ilctxt) + return NULL; + p_pghdr = list_first_entry(&ilctxt->page_list, + struct ipc_log_page_header, list); + pg = container_of(p_pghdr, struct ipc_log_page, hdr); + return pg; +} + +/** + * is_nd_read_empty - Returns true if no data is available to read in log + * + * @ilctxt: logging context + * @returns: > 1 if context is empty; 0 if not empty; <0 for failure + * + * This is for the debugfs read pointer which allows for a non-destructive read. 
+ * There may still be data in the log, but it may have already been read. + */ +static int is_nd_read_empty(struct ipc_log_context *ilctxt) +{ + if (!ilctxt) + return -EINVAL; + + return ((ilctxt->nd_read_page == ilctxt->write_page) && + (ilctxt->nd_read_page->hdr.nd_read_offset == + ilctxt->write_page->hdr.write_offset)); +} + +/** + * is_read_empty - Returns true if no data is available in log + * + * @ilctxt: logging context + * @returns: > 1 if context is empty; 0 if not empty; <0 for failure + * + * This is for the actual log contents. If it is empty, then there + * is no data at all in the log. + */ +static int is_read_empty(struct ipc_log_context *ilctxt) +{ + if (!ilctxt) + return -EINVAL; + + return ((ilctxt->read_page == ilctxt->write_page) && + (ilctxt->read_page->hdr.read_offset == + ilctxt->write_page->hdr.write_offset)); +} + +/** + * is_nd_read_equal_read - Return true if the non-destructive read is equal to + * the destructive read + * + * @ilctxt: logging context + * @returns: true if nd read is equal to read; false otherwise + */ +static bool is_nd_read_equal_read(struct ipc_log_context *ilctxt) +{ + uint16_t read_offset; + uint16_t nd_read_offset; + + if (ilctxt->nd_read_page == ilctxt->read_page) { + read_offset = ilctxt->read_page->hdr.read_offset; + nd_read_offset = ilctxt->nd_read_page->hdr.nd_read_offset; + + if (read_offset == nd_read_offset) + return true; + } + + return false; +} + + +static struct ipc_log_page *get_next_page(struct ipc_log_context *ilctxt, + struct ipc_log_page *cur_pg) +{ + struct ipc_log_page_header *p_pghdr; + struct ipc_log_page *pg = NULL; + + if (!ilctxt || !cur_pg) + return NULL; + + if (ilctxt->last_page == cur_pg) + return ilctxt->first_page; + + p_pghdr = list_first_entry(&cur_pg->hdr.list, + struct ipc_log_page_header, list); + pg = container_of(p_pghdr, struct ipc_log_page, hdr); + + return pg; +} + +/** + * ipc_log_read - do non-destructive read of the log + * + * @ilctxt: Logging context + * @data: Data pointer to receive the data + * @data_size: Number of bytes to read (must be <= bytes available in log) + * + * This read will update a runtime read pointer, but will not affect the actual + * contents of the log which allows for reading the logs continuously while + * debugging and if the system crashes, then the full logs can still be + * extracted. 
+ */ +static void ipc_log_read(struct ipc_log_context *ilctxt, + void *data, int data_size) +{ + int bytes_to_read; + + bytes_to_read = MIN(LOG_PAGE_DATA_SIZE + - ilctxt->nd_read_page->hdr.nd_read_offset, + data_size); + + memcpy(data, (ilctxt->nd_read_page->data + + ilctxt->nd_read_page->hdr.nd_read_offset), bytes_to_read); + + if (bytes_to_read != data_size) { + /* not enough space, wrap read to next page */ + ilctxt->nd_read_page->hdr.nd_read_offset = 0; + ilctxt->nd_read_page = get_next_page(ilctxt, + ilctxt->nd_read_page); + BUG_ON(ilctxt->nd_read_page == NULL); + + memcpy((data + bytes_to_read), + (ilctxt->nd_read_page->data + + ilctxt->nd_read_page->hdr.nd_read_offset), + (data_size - bytes_to_read)); + bytes_to_read = (data_size - bytes_to_read); + } + ilctxt->nd_read_page->hdr.nd_read_offset += bytes_to_read; +} + +/** + * ipc_log_drop - do destructive read of the log + * + * @ilctxt: Logging context + * @data: Data pointer to receive the data (or NULL) + * @data_size: Number of bytes to read (must be <= bytes available in log) + */ +static void ipc_log_drop(struct ipc_log_context *ilctxt, void *data, + int data_size) +{ + int bytes_to_read; + bool push_nd_read; + + bytes_to_read = MIN(LOG_PAGE_DATA_SIZE + - ilctxt->read_page->hdr.read_offset, + data_size); + if (data) + memcpy(data, (ilctxt->read_page->data + + ilctxt->read_page->hdr.read_offset), bytes_to_read); + + if (bytes_to_read != data_size) { + /* not enough space, wrap read to next page */ + push_nd_read = is_nd_read_equal_read(ilctxt); + + ilctxt->read_page->hdr.read_offset = 0; + if (push_nd_read) { + ilctxt->read_page->hdr.nd_read_offset = 0; + ilctxt->read_page = get_next_page(ilctxt, + ilctxt->read_page); + BUG_ON(ilctxt->read_page == NULL); + ilctxt->nd_read_page = ilctxt->read_page; + } else { + ilctxt->read_page = get_next_page(ilctxt, + ilctxt->read_page); + BUG_ON(ilctxt->read_page == NULL); + } + + if (data) + memcpy((data + bytes_to_read), + (ilctxt->read_page->data + + ilctxt->read_page->hdr.read_offset), + (data_size - bytes_to_read)); + + bytes_to_read = (data_size - bytes_to_read); + } + + /* update non-destructive read pointer if necessary */ + push_nd_read = is_nd_read_equal_read(ilctxt); + ilctxt->read_page->hdr.read_offset += bytes_to_read; + ilctxt->write_avail += data_size; + + if (push_nd_read) + ilctxt->nd_read_page->hdr.nd_read_offset += bytes_to_read; +} + +/** + * msg_read - Reads a message. + * + * If a message is read successfully, then the message context + * will be set to: + * .hdr message header .size and .type values + * .offset beginning of message data + * + * @ilctxt Logging context + * @ectxt Message context + * + * @returns 0 - no message available; >0 message size; <0 error + */ +static int msg_read(struct ipc_log_context *ilctxt, + struct encode_context *ectxt) +{ + struct tsv_header hdr; + + if (!ectxt) + return -EINVAL; + + if (is_nd_read_empty(ilctxt)) + return 0; + + ipc_log_read(ilctxt, &hdr, sizeof(hdr)); + ectxt->hdr.type = hdr.type; + ectxt->hdr.size = hdr.size; + ectxt->offset = sizeof(hdr); + ipc_log_read(ilctxt, (ectxt->buff + ectxt->offset), + (int)hdr.size); + + return sizeof(hdr) + (int)hdr.size; +} + +/** + * msg_drop - Drops a message. + * + * @ilctxt Logging context + */ +static void msg_drop(struct ipc_log_context *ilctxt) +{ + struct tsv_header hdr; + + if (!is_read_empty(ilctxt)) { + ipc_log_drop(ilctxt, &hdr, sizeof(hdr)); + ipc_log_drop(ilctxt, NULL, (int)hdr.size); + } +} + +/* + * Commits messages to the FIFO. 
If the FIFO is full, then enough + * messages are dropped to create space for the new message. + */ +void ipc_log_write(void *ctxt, struct encode_context *ectxt) +{ + struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt; + int bytes_to_write; + unsigned long flags; + + if (!ilctxt || !ectxt) { + pr_err("%s: Invalid ipc_log or encode context\n", __func__); + return; + } + + read_lock_irqsave(&context_list_lock_lha1, flags); + spin_lock(&ilctxt->context_lock_lhb1); + while (ilctxt->write_avail <= ectxt->offset) + msg_drop(ilctxt); + + bytes_to_write = MIN(LOG_PAGE_DATA_SIZE + - ilctxt->write_page->hdr.write_offset, + ectxt->offset); + memcpy((ilctxt->write_page->data + + ilctxt->write_page->hdr.write_offset), + ectxt->buff, bytes_to_write); + + if (bytes_to_write != ectxt->offset) { + uint64_t t_now = sched_clock(); + + ilctxt->write_page->hdr.write_offset += bytes_to_write; + ilctxt->write_page->hdr.end_time = t_now; + + ilctxt->write_page = get_next_page(ilctxt, ilctxt->write_page); + BUG_ON(ilctxt->write_page == NULL); + ilctxt->write_page->hdr.write_offset = 0; + ilctxt->write_page->hdr.start_time = t_now; + memcpy((ilctxt->write_page->data + + ilctxt->write_page->hdr.write_offset), + (ectxt->buff + bytes_to_write), + (ectxt->offset - bytes_to_write)); + bytes_to_write = (ectxt->offset - bytes_to_write); + } + ilctxt->write_page->hdr.write_offset += bytes_to_write; + ilctxt->write_avail -= ectxt->offset; + complete(&ilctxt->read_avail); + spin_unlock(&ilctxt->context_lock_lhb1); + read_unlock_irqrestore(&context_list_lock_lha1, flags); +} +EXPORT_SYMBOL(ipc_log_write); + +/* + * Starts a new message after which you can add serialized data and + * then complete the message by calling msg_encode_end(). + */ +void msg_encode_start(struct encode_context *ectxt, uint32_t type) +{ + if (!ectxt) { + pr_err("%s: Invalid encode context\n", __func__); + return; + } + + ectxt->hdr.type = type; + ectxt->hdr.size = 0; + ectxt->offset = sizeof(ectxt->hdr); +} +EXPORT_SYMBOL(msg_encode_start); + +/* + * Completes the message + */ +void msg_encode_end(struct encode_context *ectxt) +{ + if (!ectxt) { + pr_err("%s: Invalid encode context\n", __func__); + return; + } + + /* finalize data size */ + ectxt->hdr.size = ectxt->offset - sizeof(ectxt->hdr); + BUG_ON(ectxt->hdr.size > MAX_MSG_SIZE); + memcpy(ectxt->buff, &ectxt->hdr, sizeof(ectxt->hdr)); +} +EXPORT_SYMBOL(msg_encode_end); + +/* + * Helper funtion used to write data to a message context. + * + * @ectxt context initialized by calling msg_encode_start() + * @data data to write + * @size number of bytes of data to write + */ +static inline int tsv_write_data(struct encode_context *ectxt, + void *data, uint32_t size) +{ + if (!ectxt) { + pr_err("%s: Invalid encode context\n", __func__); + return -EINVAL; + } + if ((ectxt->offset + size) > MAX_MSG_SIZE) { + pr_err("%s: No space to encode further\n", __func__); + return -EINVAL; + } + + memcpy((void *)(ectxt->buff + ectxt->offset), data, size); + ectxt->offset += size; + return 0; +} + +/* + * Helper function that writes a type to the context. + * + * @ectxt context initialized by calling msg_encode_start() + * @type primitive type + * @size size of primitive in bytes + */ +static inline int tsv_write_header(struct encode_context *ectxt, + uint32_t type, uint32_t size) +{ + struct tsv_header hdr; + + hdr.type = (unsigned char)type; + hdr.size = (unsigned char)size; + return tsv_write_data(ectxt, &hdr, sizeof(hdr)); +} + +/* + * Writes the current timestamp count. 
+ * + * @ectxt context initialized by calling msg_encode_start() + */ +int tsv_timestamp_write(struct encode_context *ectxt) +{ + int ret; + uint64_t t_now = sched_clock(); + + ret = tsv_write_header(ectxt, TSV_TYPE_TIMESTAMP, sizeof(t_now)); + if (ret) + return ret; + return tsv_write_data(ectxt, &t_now, sizeof(t_now)); +} +EXPORT_SYMBOL(tsv_timestamp_write); + +/* + * Writes the current QTimer timestamp count. + * + * @ectxt context initialized by calling msg_encode_start() + */ +int tsv_qtimer_write(struct encode_context *ectxt) +{ + int ret; + uint64_t t_now = arch_counter_get_cntvct(); + + ret = tsv_write_header(ectxt, TSV_TYPE_QTIMER, sizeof(t_now)); + if (ret) + return ret; + return tsv_write_data(ectxt, &t_now, sizeof(t_now)); +} +EXPORT_SYMBOL(tsv_qtimer_write); + +/* + * Writes a data pointer. + * + * @ectxt context initialized by calling msg_encode_start() + * @pointer pointer value to write + */ +int tsv_pointer_write(struct encode_context *ectxt, void *pointer) +{ + int ret; + ret = tsv_write_header(ectxt, TSV_TYPE_POINTER, sizeof(pointer)); + if (ret) + return ret; + return tsv_write_data(ectxt, &pointer, sizeof(pointer)); +} +EXPORT_SYMBOL(tsv_pointer_write); + +/* + * Writes a 32-bit integer value. + * + * @ectxt context initialized by calling msg_encode_start() + * @n integer to write + */ +int tsv_int32_write(struct encode_context *ectxt, int32_t n) +{ + int ret; + ret = tsv_write_header(ectxt, TSV_TYPE_INT32, sizeof(n)); + if (ret) + return ret; + return tsv_write_data(ectxt, &n, sizeof(n)); +} +EXPORT_SYMBOL(tsv_int32_write); + +/* + * Writes a byte array. + * + * @ectxt context initialized by calling msg_write_start() + * @data Beginning address of data + * @data_size Size of data to be written + */ +int tsv_byte_array_write(struct encode_context *ectxt, + void *data, int data_size) +{ + int ret; + ret = tsv_write_header(ectxt, TSV_TYPE_BYTE_ARRAY, data_size); + if (ret) + return ret; + return tsv_write_data(ectxt, data, data_size); +} +EXPORT_SYMBOL(tsv_byte_array_write); + +/* + * Helper function to log a string + * + * @ilctxt ipc_log_context created using ipc_log_context_create() + * @fmt Data specified using format specifiers + */ +int ipc_log_string(void *ilctxt, const char *fmt, ...) +{ + struct encode_context ectxt; + int avail_size, data_size, hdr_size = sizeof(struct tsv_header); + va_list arg_list; + + if (!ilctxt) + return -EINVAL; + + msg_encode_start(&ectxt, TSV_TYPE_STRING); + tsv_timestamp_write(&ectxt); + tsv_qtimer_write(&ectxt); + avail_size = (MAX_MSG_SIZE - (ectxt.offset + hdr_size)); + va_start(arg_list, fmt); + data_size = vsnprintf((ectxt.buff + ectxt.offset + hdr_size), + avail_size, fmt, arg_list); + va_end(arg_list); + tsv_write_header(&ectxt, TSV_TYPE_BYTE_ARRAY, data_size); + ectxt.offset += data_size; + msg_encode_end(&ectxt); + ipc_log_write(ilctxt, &ectxt); + return 0; +} +EXPORT_SYMBOL(ipc_log_string); + +/** + * ipc_log_extract - Reads and deserializes log + * + * @ctxt: logging context + * @buff: buffer to receive the data + * @size: size of the buffer + * @returns: 0 if no data read; >0 number of bytes read; < 0 error + * + * If no data is available to be read, then the ilctxt::read_avail + * completion is reinitialized. This allows clients to block + * until new log data is save. 
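/*
 * Illustrative sketch (not part of this patch): logging a custom typed
 * message instead of a formatted string, together with a deserializer so
 * ipc_log_extract() below can render it. MY_TSV_TYPE and the function names
 * are made-up example values; add_deserialization_func() and the
 * tsv_*_read() helpers are defined further down in this file.
 */
#define MY_TSV_TYPE 0x40	/* hypothetical user-defined message type id */

static void my_log_sample(void *ctxt, int32_t value)
{
	struct encode_context ectxt;

	msg_encode_start(&ectxt, MY_TSV_TYPE);
	tsv_timestamp_write(&ectxt);
	tsv_qtimer_write(&ectxt);
	tsv_int32_write(&ectxt, value);
	msg_encode_end(&ectxt);
	ipc_log_write(ctxt, &ectxt);
}

static void my_deserialize(struct encode_context *ectxt,
			   struct decode_context *dctxt)
{
	tsv_timestamp_read(ectxt, dctxt, " ");
	tsv_qtimer_read(ectxt, dctxt, " ");
	tsv_int32_read(ectxt, dctxt, "value=%d\n");
}

/* register once, e.g. right after ipc_log_context_create():
 *	add_deserialization_func(ctxt, MY_TSV_TYPE, my_deserialize);
 */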
+ */ +int ipc_log_extract(void *ctxt, char *buff, int size) +{ + struct encode_context ectxt; + struct decode_context dctxt; + void (*deserialize_func)(struct encode_context *ectxt, + struct decode_context *dctxt); + struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt; + unsigned long flags; + + if (size < MAX_MSG_DECODED_SIZE) + return -EINVAL; + + dctxt.output_format = OUTPUT_DEBUGFS; + dctxt.buff = buff; + dctxt.size = size; + read_lock_irqsave(&context_list_lock_lha1, flags); + spin_lock(&ilctxt->context_lock_lhb1); + while (dctxt.size >= MAX_MSG_DECODED_SIZE && + !is_nd_read_empty(ilctxt)) { + msg_read(ilctxt, &ectxt); + deserialize_func = get_deserialization_func(ilctxt, + ectxt.hdr.type); + spin_unlock(&ilctxt->context_lock_lhb1); + read_unlock_irqrestore(&context_list_lock_lha1, flags); + if (deserialize_func) + deserialize_func(&ectxt, &dctxt); + else + pr_err("%s: unknown message 0x%x\n", + __func__, ectxt.hdr.type); + read_lock_irqsave(&context_list_lock_lha1, flags); + spin_lock(&ilctxt->context_lock_lhb1); + } + if ((size - dctxt.size) == 0) + reinit_completion(&ilctxt->read_avail); + spin_unlock(&ilctxt->context_lock_lhb1); + read_unlock_irqrestore(&context_list_lock_lha1, flags); + return size - dctxt.size; +} +EXPORT_SYMBOL(ipc_log_extract); + +/* + * Helper funtion used to read data from a message context. + * + * @ectxt context initialized by calling msg_read() + * @data data to read + * @size number of bytes of data to read + */ +static void tsv_read_data(struct encode_context *ectxt, + void *data, uint32_t size) +{ + BUG_ON((ectxt->offset + size) > MAX_MSG_SIZE); + memcpy(data, (ectxt->buff + ectxt->offset), size); + ectxt->offset += size; +} + +/* + * Helper function that reads a type from the context and updates the + * context pointers. + * + * @ectxt context initialized by calling msg_read() + * @hdr type header + */ +static void tsv_read_header(struct encode_context *ectxt, + struct tsv_header *hdr) +{ + BUG_ON((ectxt->offset + sizeof(*hdr)) > MAX_MSG_SIZE); + memcpy(hdr, (ectxt->buff + ectxt->offset), sizeof(*hdr)); + ectxt->offset += sizeof(*hdr); +} + +/* + * Reads a timestamp. + * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format (appended to %6u.09u timestamp format) + */ +void tsv_timestamp_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + uint64_t val; + unsigned long nanosec_rem; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_TIMESTAMP); + tsv_read_data(ectxt, &val, sizeof(val)); + nanosec_rem = do_div(val, 1000000000U); + IPC_SPRINTF_DECODE(dctxt, "[%6u.%09lu%s/", + (unsigned)val, nanosec_rem, format); +} +EXPORT_SYMBOL(tsv_timestamp_read); + +/* + * Reads a QTimer timestamp. + * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format (appended to %#18llx timestamp format) + */ +void tsv_qtimer_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + uint64_t val; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_QTIMER); + tsv_read_data(ectxt, &val, sizeof(val)); + + /* + * This gives 16 hex digits of output. The # prefix prepends + * a 0x, and these characters count as part of the number. + */ + IPC_SPRINTF_DECODE(dctxt, "%#18llx]%s", val, format); +} +EXPORT_SYMBOL(tsv_qtimer_read); + +/* + * Reads a data pointer. 
+ * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format + */ +void tsv_pointer_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + void *val; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_POINTER); + tsv_read_data(ectxt, &val, sizeof(val)); + + IPC_SPRINTF_DECODE(dctxt, format, val); +} +EXPORT_SYMBOL(tsv_pointer_read); + +/* + * Reads a 32-bit integer value. + * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format + */ +int32_t tsv_int32_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + int32_t val; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_INT32); + tsv_read_data(ectxt, &val, sizeof(val)); + + IPC_SPRINTF_DECODE(dctxt, format, val); + return val; +} +EXPORT_SYMBOL(tsv_int32_read); + +/* + * Reads a byte array/string. + * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format + */ +void tsv_byte_array_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_BYTE_ARRAY); + tsv_read_data(ectxt, dctxt->buff, hdr.size); + dctxt->buff += hdr.size; + dctxt->size -= hdr.size; +} +EXPORT_SYMBOL(tsv_byte_array_read); + +int add_deserialization_func(void *ctxt, int type, + void (*dfunc)(struct encode_context *, + struct decode_context *)) +{ + struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt; + struct dfunc_info *df_info; + unsigned long flags; + + if (!ilctxt || !dfunc) + return -EINVAL; + + df_info = kmalloc(sizeof(struct dfunc_info), GFP_KERNEL); + if (!df_info) + return -ENOSPC; + + read_lock_irqsave(&context_list_lock_lha1, flags); + spin_lock(&ilctxt->context_lock_lhb1); + df_info->type = type; + df_info->dfunc = dfunc; + list_add_tail(&df_info->list, &ilctxt->dfunc_info_list); + spin_unlock(&ilctxt->context_lock_lhb1); + read_unlock_irqrestore(&context_list_lock_lha1, flags); + return 0; +} +EXPORT_SYMBOL(add_deserialization_func); + +static void *get_deserialization_func(struct ipc_log_context *ilctxt, + int type) +{ + struct dfunc_info *df_info = NULL; + + if (!ilctxt) + return NULL; + + list_for_each_entry(df_info, &ilctxt->dfunc_info_list, list) { + if (df_info->type == type) + return df_info->dfunc; + } + return NULL; +} + +/** + * ipc_log_context_create: Create a debug log context + * Should not be called from atomic context + * + * @max_num_pages: Number of pages of logging space required (max. 
10) + * @mod_name : Name of the directory entry under DEBUGFS + * @user_version : Version number of user-defined message formats + * + * returns context id on success, NULL on failure + */ +void *ipc_log_context_create(int max_num_pages, + const char *mod_name, uint16_t user_version) +{ + struct ipc_log_context *ctxt; + struct ipc_log_page *pg = NULL; + int page_cnt; + unsigned long flags; + + ctxt = kzalloc(sizeof(struct ipc_log_context), GFP_KERNEL); + if (!ctxt) { + pr_err("%s: cannot create ipc_log_context\n", __func__); + return 0; + } + + init_completion(&ctxt->read_avail); + INIT_LIST_HEAD(&ctxt->page_list); + INIT_LIST_HEAD(&ctxt->dfunc_info_list); + spin_lock_init(&ctxt->context_lock_lhb1); + for (page_cnt = 0; page_cnt < max_num_pages; page_cnt++) { + pg = kzalloc(sizeof(struct ipc_log_page), GFP_KERNEL); + if (!pg) { + pr_err("%s: cannot create ipc_log_page\n", __func__); + goto release_ipc_log_context; + } + pg->hdr.log_id = (uint64_t)(uintptr_t)ctxt; + pg->hdr.page_num = LOG_PAGE_FLAG | page_cnt; + pg->hdr.ctx_offset = (int64_t)((uint64_t)(uintptr_t)ctxt - + (uint64_t)(uintptr_t)&pg->hdr); + + /* set magic last to signal that page init is complete */ + pg->hdr.magic = IPC_LOGGING_MAGIC_NUM; + pg->hdr.nmagic = ~(IPC_LOGGING_MAGIC_NUM); + + spin_lock_irqsave(&ctxt->context_lock_lhb1, flags); + list_add_tail(&pg->hdr.list, &ctxt->page_list); + spin_unlock_irqrestore(&ctxt->context_lock_lhb1, flags); + } + + ctxt->log_id = (uint64_t)(uintptr_t)ctxt; + ctxt->version = IPC_LOG_VERSION; + strlcpy(ctxt->name, mod_name, IPC_LOG_MAX_CONTEXT_NAME_LEN); + ctxt->user_version = user_version; + ctxt->first_page = get_first_page(ctxt); + ctxt->last_page = pg; + ctxt->write_page = ctxt->first_page; + ctxt->read_page = ctxt->first_page; + ctxt->nd_read_page = ctxt->first_page; + ctxt->write_avail = max_num_pages * LOG_PAGE_DATA_SIZE; + ctxt->header_size = sizeof(struct ipc_log_page_header); + create_ctx_debugfs(ctxt, mod_name); + + /* set magic last to signal context init is complete */ + ctxt->magic = IPC_LOG_CONTEXT_MAGIC_NUM; + ctxt->nmagic = ~(IPC_LOG_CONTEXT_MAGIC_NUM); + + write_lock_irqsave(&context_list_lock_lha1, flags); + list_add_tail(&ctxt->list, &ipc_log_context_list); + write_unlock_irqrestore(&context_list_lock_lha1, flags); + return (void *)ctxt; + +release_ipc_log_context: + while (page_cnt-- > 0) { + pg = get_first_page(ctxt); + list_del(&pg->hdr.list); + kfree(pg); + } + kfree(ctxt); + return 0; +} +EXPORT_SYMBOL(ipc_log_context_create); + +/* + * Destroy debug log context + * + * @ctxt: debug log context created by calling ipc_log_context_create API. 
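 *
 * Editor's illustration (not part of the patch): a typical create/use/destroy
 * pairing in a client driver, assuming the ipc_log_string() convenience macro
 * declared alongside this API in include/linux/ipc_logging.h. The context
 * name and page count are placeholders:
 *
 *     void *ipc_log;
 *
 *     ipc_log = ipc_log_context_create(4, "my_driver", 0);
 *     if (ipc_log)
 *             ipc_log_string(ipc_log, "probe done\n");
 *     ...
 *     ipc_log_context_destroy(ipc_log);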
+ */ +int ipc_log_context_destroy(void *ctxt) +{ + struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt; + struct ipc_log_page *pg = NULL; + unsigned long flags; + + if (!ilctxt) + return 0; + + while (!list_empty(&ilctxt->page_list)) { + pg = get_first_page(ctxt); + list_del(&pg->hdr.list); + kfree(pg); + } + + write_lock_irqsave(&context_list_lock_lha1, flags); + list_del(&ilctxt->list); + write_unlock_irqrestore(&context_list_lock_lha1, flags); + + debugfs_remove_recursive(ilctxt->dent); + + kfree(ilctxt); + return 0; +} +EXPORT_SYMBOL(ipc_log_context_destroy); + +static int __init ipc_logging_init(void) +{ + check_and_create_debugfs(); + return 0; +} + +module_init(ipc_logging_init); + +MODULE_DESCRIPTION("ipc logging"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/trace/ipc_logging_debug.c b/kernel/trace/ipc_logging_debug.c new file mode 100644 index 000000000000..a54538798f2b --- /dev/null +++ b/kernel/trace/ipc_logging_debug.c @@ -0,0 +1,184 @@ +/* Copyright (c) 2012-2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/debugfs.h> +#include <linux/io.h> +#include <linux/idr.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/delay.h> +#include <linux/completion.h> +#include <linux/ipc_logging.h> + +#include "ipc_logging_private.h" + +static DEFINE_MUTEX(ipc_log_debugfs_init_lock); +static struct dentry *root_dent; + +static int debug_log(struct ipc_log_context *ilctxt, + char *buff, int size, int cont) +{ + int i = 0; + int ret; + + if (size < MAX_MSG_DECODED_SIZE) { + pr_err("%s: buffer size %d < %d\n", __func__, size, + MAX_MSG_DECODED_SIZE); + return -ENOMEM; + } + do { + i = ipc_log_extract(ilctxt, buff, size - 1); + if (cont && i == 0) { + ret = wait_for_completion_interruptible( + &ilctxt->read_avail); + if (ret < 0) + return ret; + } + } while (cont && i == 0); + + return i; +} + +/* + * VFS Read operation helper which dispatches the call to the debugfs + * read command stored in file->private_data. 
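 *
 * Editor's note (not part of the patch): given the directory layout created
 * by check_and_create_debugfs() and create_ctx_debugfs() below, a context
 * named "my_driver" would typically be read as
 * /sys/kernel/debug/ipc_logging/my_driver/log (one-shot) or
 * /sys/kernel/debug/ipc_logging/my_driver/log_cont (blocking, continuous),
 * assuming debugfs is mounted at its usual location. The name is illustrative.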
+ * + * @file File structure + * @buff user buffer + * @count size of user buffer + * @ppos file position to read from (only a value of 0 is accepted) + * @cont 1 = continuous mode (don't return 0 to signal end-of-file) + * + * @returns ==0 end of file + * >0 number of bytes read + * <0 error + */ +static ssize_t debug_read_helper(struct file *file, char __user *buff, + size_t count, loff_t *ppos, int cont) +{ + struct ipc_log_context *ilctxt = file->private_data; + char *buffer; + int bsize; + + buffer = kmalloc(count, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + bsize = debug_log(ilctxt, buffer, count, cont); + if (bsize > 0) { + if (copy_to_user(buff, buffer, bsize)) { + kfree(buffer); + return -EFAULT; + } + *ppos += bsize; + } + kfree(buffer); + return bsize; +} + +static ssize_t debug_read(struct file *file, char __user *buff, + size_t count, loff_t *ppos) +{ + return debug_read_helper(file, buff, count, ppos, 0); +} + +static ssize_t debug_read_cont(struct file *file, char __user *buff, + size_t count, loff_t *ppos) +{ + return debug_read_helper(file, buff, count, ppos, 1); +} + +static int debug_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static const struct file_operations debug_ops = { + .read = debug_read, + .open = debug_open, +}; + +static const struct file_operations debug_ops_cont = { + .read = debug_read_cont, + .open = debug_open, +}; + +static void debug_create(const char *name, mode_t mode, + struct dentry *dent, + struct ipc_log_context *ilctxt, + const struct file_operations *fops) +{ + debugfs_create_file(name, mode, dent, ilctxt, fops); +} + +static void dfunc_string(struct encode_context *ectxt, + struct decode_context *dctxt) +{ + tsv_timestamp_read(ectxt, dctxt, ""); + tsv_qtimer_read(ectxt, dctxt, " "); + tsv_byte_array_read(ectxt, dctxt, ""); + + /* add trailing \n if necessary */ + if (*(dctxt->buff - 1) != '\n') { + if (dctxt->size) { + ++dctxt->buff; + --dctxt->size; + } + *(dctxt->buff - 1) = '\n'; + } +} + +void check_and_create_debugfs(void) +{ + mutex_lock(&ipc_log_debugfs_init_lock); + if (!root_dent) { + root_dent = debugfs_create_dir("ipc_logging", 0); + + if (IS_ERR(root_dent)) { + pr_err("%s: unable to create debugfs %ld\n", + __func__, PTR_ERR(root_dent)); + root_dent = NULL; + } + } + mutex_unlock(&ipc_log_debugfs_init_lock); +} +EXPORT_SYMBOL(check_and_create_debugfs); + +void create_ctx_debugfs(struct ipc_log_context *ctxt, + const char *mod_name) +{ + if (!root_dent) + check_and_create_debugfs(); + + if (root_dent) { + ctxt->dent = debugfs_create_dir(mod_name, root_dent); + if (!IS_ERR(ctxt->dent)) { + debug_create("log", 0444, ctxt->dent, + ctxt, &debug_ops); + debug_create("log_cont", 0444, ctxt->dent, + ctxt, &debug_ops_cont); + } + } + add_deserialization_func((void *)ctxt, + TSV_TYPE_STRING, dfunc_string); +} +EXPORT_SYMBOL(create_ctx_debugfs); diff --git a/kernel/trace/ipc_logging_private.h b/kernel/trace/ipc_logging_private.h new file mode 100644 index 000000000000..3ac950695086 --- /dev/null +++ b/kernel/trace/ipc_logging_private.h @@ -0,0 +1,165 @@ +/* Copyright (c) 2012-2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef _IPC_LOGGING_PRIVATE_H +#define _IPC_LOGGING_PRIVATE_H + +#include <linux/ipc_logging.h> + +#define IPC_LOG_VERSION 0x0003 +#define IPC_LOG_MAX_CONTEXT_NAME_LEN 32 + +/** + * struct ipc_log_page_header - Individual log page header + * + * @magic: Magic number (used for log extraction) + * @nmagic: Inverse of magic number (used for log extraction) + * @page_num: Index of page (0.. N - 1) (note top bit is always set) + * @read_offset: Read offset in page + * @write_offset: Write offset in page (or 0xFFFF if full) + * @log_id: ID of logging context that owns this page + * @start_time: Scheduler clock for first write time in page + * @end_time: Scheduler clock for last write time in page + * @ctx_offset: Signed offset from page to the logging context. Used to + * optimize ram-dump extraction. + * + * @list: Linked list of pages that make up a log + * @nd_read_offset: Non-destructive read offset used for debugfs + * + * The first part of the structure defines data that is used to extract the + * logs from a memory dump and elements in this section should not be changed + * or re-ordered. New local data structures can be added to the end of the + * structure since they will be ignored by the extraction tool. + */ +struct ipc_log_page_header { + uint32_t magic; + uint32_t nmagic; + uint32_t page_num; + uint16_t read_offset; + uint16_t write_offset; + uint64_t log_id; + uint64_t start_time; + uint64_t end_time; + int64_t ctx_offset; + + /* add local data structures after this point */ + struct list_head list; + uint16_t nd_read_offset; +}; + +/** + * struct ipc_log_page - Individual log page + * + * @hdr: Log page header + * @data: Log data + * + * Each log consists of 1 to N log pages. Data size is adjusted to always fit + * the structure into a single kernel page. 
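 *
 * Editor's worked example (not part of the patch): on a 64-bit build with
 * 4 KiB pages, the extraction-visible fields of ipc_log_page_header add up to
 * 4 + 4 + 4 + 2 + 2 + 8 + 8 + 8 + 8 = 48 bytes, and the local list_head plus
 * nd_read_offset pad the header to roughly 72 bytes, leaving about 4024 bytes
 * of log data per page. The exact figure depends on PAGE_SIZE and structure
 * padding.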
+ */ +struct ipc_log_page { + struct ipc_log_page_header hdr; + char data[PAGE_SIZE - sizeof(struct ipc_log_page_header)]; +}; + +/** + * struct ipc_log_context - main logging context + * + * @magic: Magic number (used for log extraction) + * @nmagic: Inverse of magic number (used for log extraction) + * @version: IPC Logging version of log format + * @user_version: Version number for user-defined messages + * @header_size: Size of the log header which is used to determine the offset + * of ipc_log_page::data + * @log_id: Log ID (assigned when log is created) + * @name: Name of the log used to uniquely identify the log during extraction + * + * @list: List of log contexts (struct ipc_log_context) + * @page_list: List of log pages (struct ipc_log_page) + * @first_page: First page in list of logging pages + * @last_page: Last page in list of logging pages + * @write_page: Current write page + * @read_page: Current read page (for internal reads) + * @nd_read_page: Current debugfs extraction page (non-destructive) + * + * @write_avail: Number of bytes available to write in all pages + * @dent: Debugfs node for run-time log extraction + * @dfunc_info_list: List of deserialization functions + * @context_lock_lhb1: Lock for entire structure + * @read_avail: Completed when new data is added to the log + */ +struct ipc_log_context { + uint32_t magic; + uint32_t nmagic; + uint32_t version; + uint16_t user_version; + uint16_t header_size; + uint64_t log_id; + char name[IPC_LOG_MAX_CONTEXT_NAME_LEN]; + + /* add local data structures after this point */ + struct list_head list; + struct list_head page_list; + struct ipc_log_page *first_page; + struct ipc_log_page *last_page; + struct ipc_log_page *write_page; + struct ipc_log_page *read_page; + struct ipc_log_page *nd_read_page; + + uint32_t write_avail; + struct dentry *dent; + struct list_head dfunc_info_list; + spinlock_t context_lock_lhb1; + struct completion read_avail; +}; + +struct dfunc_info { + struct list_head list; + int type; + void (*dfunc) (struct encode_context *, struct decode_context *); +}; + +enum { + TSV_TYPE_INVALID, + TSV_TYPE_TIMESTAMP, + TSV_TYPE_POINTER, + TSV_TYPE_INT32, + TSV_TYPE_BYTE_ARRAY, + TSV_TYPE_QTIMER, +}; + +enum { + OUTPUT_DEBUGFS, +}; + +#define IPC_LOG_CONTEXT_MAGIC_NUM 0x25874452 +#define IPC_LOGGING_MAGIC_NUM 0x52784425 +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define IS_MSG_TYPE(x) (((x) > TSV_TYPE_MSG_START) && \ + ((x) < TSV_TYPE_MSG_END)) +#define MAX_MSG_DECODED_SIZE (MAX_MSG_SIZE*4) + +#if (defined(CONFIG_DEBUG_FS)) +void check_and_create_debugfs(void); + +void create_ctx_debugfs(struct ipc_log_context *ctxt, + const char *mod_name); +#else +void check_and_create_debugfs(void) +{ +} + +void create_ctx_debugfs(struct ipc_log_context *ctxt, const char *mod_name) +{ +} +#endif + +#endif diff --git a/kernel/trace/msm_rtb.c b/kernel/trace/msm_rtb.c new file mode 100644 index 000000000000..ba609d5eb07f --- /dev/null +++ b/kernel/trace/msm_rtb.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2013-2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/atomic.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/dma-mapping.h> +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/atomic.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/io.h> +#include <asm-generic/sizes.h> +#include <linux/msm_rtb.h> + +#define SENTINEL_BYTE_1 0xFF +#define SENTINEL_BYTE_2 0xAA +#define SENTINEL_BYTE_3 0xFF + +#define RTB_COMPAT_STR "qcom,msm-rtb" + +/* Write + * 1) 3 bytes sentinel + * 2) 1 bytes of log type + * 3) 8 bytes of where the caller came from + * 4) 4 bytes index + * 4) 8 bytes extra data from the caller + * 5) 8 bytes of timestamp + * + * Total = 32 bytes. + */ +struct msm_rtb_layout { + unsigned char sentinel[3]; + unsigned char log_type; + uint32_t idx; + uint64_t caller; + uint64_t data; + uint64_t timestamp; +} __attribute__ ((__packed__)); + + +struct msm_rtb_state { + struct msm_rtb_layout *rtb; + phys_addr_t phys; + int nentries; + int size; + int enabled; + int initialized; + uint32_t filter; + int step_size; +}; + +#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS) +DEFINE_PER_CPU(atomic_t, msm_rtb_idx_cpu); +#else +static atomic_t msm_rtb_idx; +#endif + +static struct msm_rtb_state msm_rtb = { + .filter = 1 << LOGK_LOGBUF, + .enabled = 1, +}; + +module_param_named(filter, msm_rtb.filter, uint, 0644); +module_param_named(enable, msm_rtb.enabled, int, 0644); + +static int msm_rtb_panic_notifier(struct notifier_block *this, + unsigned long event, void *ptr) +{ + msm_rtb.enabled = 0; + return NOTIFY_DONE; +} + +static struct notifier_block msm_rtb_panic_blk = { + .notifier_call = msm_rtb_panic_notifier, + .priority = INT_MAX, +}; + +int notrace msm_rtb_event_should_log(enum logk_event_type log_type) +{ + return msm_rtb.initialized && msm_rtb.enabled && + ((1 << (log_type & ~LOGTYPE_NOPC)) & msm_rtb.filter); +} +EXPORT_SYMBOL(msm_rtb_event_should_log); + +static void msm_rtb_emit_sentinel(struct msm_rtb_layout *start) +{ + start->sentinel[0] = SENTINEL_BYTE_1; + start->sentinel[1] = SENTINEL_BYTE_2; + start->sentinel[2] = SENTINEL_BYTE_3; +} + +static void msm_rtb_write_type(enum logk_event_type log_type, + struct msm_rtb_layout *start) +{ + start->log_type = (char)log_type; +} + +static void msm_rtb_write_caller(uint64_t caller, struct msm_rtb_layout *start) +{ + start->caller = caller; +} + +static void msm_rtb_write_idx(uint32_t idx, + struct msm_rtb_layout *start) +{ + start->idx = idx; +} + +static void msm_rtb_write_data(uint64_t data, struct msm_rtb_layout *start) +{ + start->data = data; +} + +static void msm_rtb_write_timestamp(struct msm_rtb_layout *start) +{ + start->timestamp = sched_clock(); +} + +static void uncached_logk_pc_idx(enum logk_event_type log_type, uint64_t caller, + uint64_t data, int idx) +{ + struct msm_rtb_layout *start; + + start = &msm_rtb.rtb[idx & (msm_rtb.nentries - 1)]; + + msm_rtb_emit_sentinel(start); + msm_rtb_write_type(log_type, start); + msm_rtb_write_caller(caller, start); + msm_rtb_write_idx(idx, start); + msm_rtb_write_data(data, start); + msm_rtb_write_timestamp(start); + mb(); + + return; +} + +static void uncached_logk_timestamp(int idx) +{ + unsigned long long timestamp; + + timestamp = sched_clock(); + uncached_logk_pc_idx(LOGK_TIMESTAMP|LOGTYPE_NOPC, + (uint64_t)lower_32_bits(timestamp), + (uint64_t)upper_32_bits(timestamp), idx); +} + +#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS) +static int 
msm_rtb_get_idx(void) +{ + int cpu, i, offset; + atomic_t *index; + + /* + * ideally we would use get_cpu but this is a close enough + * approximation for our purposes. + */ + cpu = raw_smp_processor_id(); + + index = &per_cpu(msm_rtb_idx_cpu, cpu); + + i = atomic_add_return(msm_rtb.step_size, index); + i -= msm_rtb.step_size; + + /* Check if index has wrapped around */ + offset = (i & (msm_rtb.nentries - 1)) - + ((i - msm_rtb.step_size) & (msm_rtb.nentries - 1)); + if (offset < 0) { + uncached_logk_timestamp(i); + i = atomic_add_return(msm_rtb.step_size, index); + i -= msm_rtb.step_size; + } + + return i; +} +#else +static int msm_rtb_get_idx(void) +{ + int i, offset; + + i = atomic_inc_return(&msm_rtb_idx); + i--; + + /* Check if index has wrapped around */ + offset = (i & (msm_rtb.nentries - 1)) - + ((i - 1) & (msm_rtb.nentries - 1)); + if (offset < 0) { + uncached_logk_timestamp(i); + i = atomic_inc_return(&msm_rtb_idx); + i--; + } + + return i; +} +#endif + +int notrace uncached_logk_pc(enum logk_event_type log_type, void *caller, + void *data) +{ + int i; + + if (!msm_rtb_event_should_log(log_type)) + return 0; + + i = msm_rtb_get_idx(); + uncached_logk_pc_idx(log_type, (uint64_t)((unsigned long) caller), + (uint64_t)((unsigned long) data), i); + + return 1; +} +EXPORT_SYMBOL(uncached_logk_pc); + +noinline int notrace uncached_logk(enum logk_event_type log_type, void *data) +{ + return uncached_logk_pc(log_type, __builtin_return_address(0), data); +} +EXPORT_SYMBOL(uncached_logk); + +static int msm_rtb_probe(struct platform_device *pdev) +{ + struct msm_rtb_platform_data *d = pdev->dev.platform_data; +#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS) + unsigned int cpu; +#endif + int ret; + + if (!pdev->dev.of_node) { + msm_rtb.size = d->size; + } else { + u64 size; + struct device_node *pnode; + + pnode = of_parse_phandle(pdev->dev.of_node, + "linux,contiguous-region", 0); + if (pnode != NULL) { + const u32 *addr; + + addr = of_get_address(pnode, 0, &size, NULL); + if (!addr) { + of_node_put(pnode); + return -EINVAL; + } + of_node_put(pnode); + } else { + ret = of_property_read_u32(pdev->dev.of_node, + "qcom,rtb-size", + (u32 *)&size); + if (ret < 0) + return ret; + + } + + msm_rtb.size = size; + } + + if (msm_rtb.size <= 0 || msm_rtb.size > SZ_1M) + return -EINVAL; + + msm_rtb.rtb = dma_alloc_coherent(&pdev->dev, msm_rtb.size, + &msm_rtb.phys, + GFP_KERNEL); + + if (!msm_rtb.rtb) + return -ENOMEM; + + msm_rtb.nentries = msm_rtb.size / sizeof(struct msm_rtb_layout); + + /* Round this down to a power of 2 */ + msm_rtb.nentries = __rounddown_pow_of_two(msm_rtb.nentries); + + memset(msm_rtb.rtb, 0, msm_rtb.size); + + +#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS) + for_each_possible_cpu(cpu) { + atomic_t *a = &per_cpu(msm_rtb_idx_cpu, cpu); + atomic_set(a, cpu); + } + msm_rtb.step_size = num_possible_cpus(); +#else + atomic_set(&msm_rtb_idx, 0); + msm_rtb.step_size = 1; +#endif + + atomic_notifier_chain_register(&panic_notifier_list, + &msm_rtb_panic_blk); + msm_rtb.initialized = 1; + return 0; +} + +static struct of_device_id msm_match_table[] = { + {.compatible = RTB_COMPAT_STR}, + {}, +}; + +static struct platform_driver msm_rtb_driver = { + .driver = { + .name = "msm_rtb", + .owner = THIS_MODULE, + .of_match_table = msm_match_table + }, +}; + +static int __init msm_rtb_init(void) +{ + return platform_driver_probe(&msm_rtb_driver, msm_rtb_probe); +} + +static void __exit msm_rtb_exit(void) +{ + platform_driver_unregister(&msm_rtb_driver); +} +module_init(msm_rtb_init) 
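/*
 * Editor's sketch (illustration only, not part of the patch): how a caller
 * would record an event in the RTB ring buffer defined above. uncached_logk()
 * captures the caller's return address automatically; uncached_logk_pc() lets
 * the caller pass an explicit PC. An entry is only written once the driver
 * has probed and the "msm_rtb.filter" module parameter has the bit for the
 * event type set (LOGK_LOGBUF is enabled by default). The wrapper name and
 * cookie are placeholders.
 */
#include <linux/msm_rtb.h>

static inline void example_rtb_log(void *cookie)
{
	/* writes sentinel, type, caller PC, index, cookie and timestamp */
	uncached_logk(LOGK_LOGBUF, cookie);
}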
+module_exit(msm_rtb_exit) diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a132ec..9270e1ac6460 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); - +EXPORT_TRACEPOINT_SYMBOL(core_ctl_set_busy); +EXPORT_TRACEPOINT_SYMBOL(core_ctl_eval_need); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8305cbb2d5a2..ae68222c5a74 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,6 +41,7 @@ #include <linux/nmi.h> #include <linux/fs.h> #include <linux/sched/rt.h> +#include <linux/coresight-stm.h> #include "trace.h" #include "trace_output.h" @@ -573,8 +574,11 @@ int __trace_puts(unsigned long ip, const char *str, int size) if (entry->buf[size - 1] != '\n') { entry->buf[size] = '\n'; entry->buf[size + 1] = '\0'; - } else + stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, size + 2); + } else { entry->buf[size] = '\0'; + stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, size + 1); + } __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); @@ -615,6 +619,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry = ring_buffer_event_data(event); entry->ip = ip; entry->str = str; + stm_log(OST_ENTITY_TRACE_PRINTK, entry->str, strlen(entry->str)+1); __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); @@ -1352,6 +1357,7 @@ void tracing_reset_all_online_cpus(void) #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX +static unsigned saved_tgids[SAVED_CMDLINES_DEFAULT]; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -1590,7 +1596,7 @@ static int trace_save_cmdline(struct task_struct *tsk) } set_cmdline(idx, tsk->comm); - + saved_tgids[idx] = tsk->tgid; arch_spin_unlock(&trace_cmdline_lock); return 1; @@ -1633,6 +1639,25 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } +int trace_find_tgid(int pid) +{ + unsigned map; + int tgid; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + map = savedcmd->map_pid_to_cmdline[pid]; + if (map != NO_CMDLINE_MAP) + tgid = saved_tgids[map]; + else + tgid = -1; + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + return tgid; +} + void tracing_record_cmdline(struct task_struct *tsk) { if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) @@ -2220,6 +2245,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { + stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, len + 1); __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } @@ -2583,6 +2609,13 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) "# | | | | |\n"); } +static void print_func_help_header_tgid(struct trace_buffer *buf, struct seq_file *m) +{ + print_event_info(buf, m); + seq_puts(m, "# TASK-PID TGID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | | |\n"); +} + static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); @@ -2595,6 +2628,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file "# | | | |||| | |\n"); } +static void print_func_help_header_irq_tgid(struct trace_buffer *buf, struct 
seq_file *m) +{ + print_event_info(buf, m); + seq_puts(m, "# _-----=> irqs-off\n"); + seq_puts(m, "# / _----=> need-resched\n"); + seq_puts(m, "# | / _---=> hardirq/softirq\n"); + seq_puts(m, "# || / _--=> preempt-depth\n"); + seq_puts(m, "# ||| / delay\n"); + seq_puts(m, "# TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |||| | |\n"); +} + void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { @@ -2907,9 +2952,15 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->trace_buffer, m); + if (trace_flags & TRACE_ITER_TGID) + print_func_help_header_irq_tgid(iter->trace_buffer, m); + else + print_func_help_header_irq(iter->trace_buffer, m); else - print_func_help_header(iter->trace_buffer, m); + if (trace_flags & TRACE_ITER_TGID) + print_func_help_header_tgid(iter->trace_buffer, m); + else + print_func_help_header(iter->trace_buffer, m); } } } @@ -4161,6 +4212,50 @@ static void trace_insert_enum_map(struct module *mod, } static ssize_t +tracing_saved_tgids_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES_DEFAULT*(16+1+16), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES_DEFAULT; i++) { + int tgid; + int r; + + pid = savedcmd->map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + tgid = trace_find_tgid(pid); + r = sprintf(buf, "%d %d\n", pid, tgid); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + + return len; +} + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_open_generic, + .read = tracing_saved_tgids_read, + .llseek = generic_file_llseek, +}; + +static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -5179,8 +5274,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; entry->buf[cnt + 1] = '\0'; - } else + stm_log(OST_ENTITY_TRACE_MARKER, entry->buf, cnt + 2); + } else { entry->buf[cnt] = '\0'; + stm_log(OST_ENTITY_TRACE_MARKER, entry->buf, cnt + 1); + } __buffer_unlock_commit(buffer, event); @@ -6787,6 +6885,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); + trace_create_file("saved_tgids", 0444, d_tracer, + tr, &tracing_saved_tgids_fops); + trace_create_file("trace_clock", 0644, d_tracer, tr, &trace_clock_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 919d9d07686f..e1265f95457f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -656,6 +656,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern int trace_find_tgid(int pid); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; @@ -970,7 +971,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ - BRANCH_FLAGS + BRANCH_FLAGS \ + C(TGID, "print-tgid"), /* * By defining C, we can make TRACE_FLAGS a list of bit names diff --git a/kernel/trace/trace_cpu_freq_switch.c 
b/kernel/trace/trace_cpu_freq_switch.c new file mode 100644 index 000000000000..f9dab6c4bb72 --- /dev/null +++ b/kernel/trace/trace_cpu_freq_switch.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2012, 2016 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/hrtimer.h> +#include <linux/tracefs.h> +#include <linux/ktime.h> +#include <trace/events/power.h> +#include "trace_stat.h" +#include "trace.h" + +struct trans { + struct rb_node node; + unsigned int cpu; + unsigned int start_freq; + unsigned int end_freq; + unsigned int min_us; + unsigned int max_us; + ktime_t total_t; + unsigned int count; +}; +static struct rb_root freq_trans_tree = RB_ROOT; + +static struct trans *tr_search(struct rb_root *root, unsigned int cpu, + unsigned int start_freq, unsigned int end_freq) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct trans *tr = container_of(node, struct trans, node); + + if (cpu < tr->cpu) + node = node->rb_left; + else if (cpu > tr->cpu) + node = node->rb_right; + else if (start_freq < tr->start_freq) + node = node->rb_left; + else if (start_freq > tr->start_freq) + node = node->rb_right; + else if (end_freq < tr->end_freq) + node = node->rb_left; + else if (end_freq > tr->end_freq) + node = node->rb_right; + else + return tr; + } + return NULL; +} + +static int tr_insert(struct rb_root *root, struct trans *tr) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct trans *this = container_of(*new, struct trans, node); + + parent = *new; + if (tr->cpu < this->cpu) + new = &((*new)->rb_left); + else if (tr->cpu > this->cpu) + new = &((*new)->rb_right); + else if (tr->start_freq < this->start_freq) + new = &((*new)->rb_left); + else if (tr->start_freq > this->start_freq) + new = &((*new)->rb_right); + else if (tr->end_freq < this->end_freq) + new = &((*new)->rb_left); + else if (tr->end_freq > this->end_freq) + new = &((*new)->rb_right); + else + return -EINVAL; + } + + rb_link_node(&tr->node, parent, new); + rb_insert_color(&tr->node, root); + + return 0; +} + +struct trans_state { + spinlock_t lock; + unsigned int start_freq; + unsigned int end_freq; + ktime_t start_t; + bool started; +}; +static DEFINE_PER_CPU(struct trans_state, freq_trans_state); + +static DEFINE_SPINLOCK(state_lock); + +static void probe_start(void *ignore, unsigned int start_freq, + unsigned int end_freq, unsigned int cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + per_cpu(freq_trans_state, cpu).start_freq = start_freq; + per_cpu(freq_trans_state, cpu).end_freq = end_freq; + per_cpu(freq_trans_state, cpu).start_t = ktime_get(); + per_cpu(freq_trans_state, cpu).started = true; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void probe_end(void *ignore, unsigned int cpu) +{ + unsigned long flags; + struct trans *tr; + s64 dur_us; + ktime_t dur_t, end_t = ktime_get(); + + spin_lock_irqsave(&state_lock, flags); + + if 
(!per_cpu(freq_trans_state, cpu).started) + goto out; + + dur_t = ktime_sub(end_t, per_cpu(freq_trans_state, cpu).start_t); + dur_us = ktime_to_us(dur_t); + + tr = tr_search(&freq_trans_tree, cpu, + per_cpu(freq_trans_state, cpu).start_freq, + per_cpu(freq_trans_state, cpu).end_freq); + if (!tr) { + tr = kzalloc(sizeof(*tr), GFP_ATOMIC); + if (!tr) { + WARN_ONCE(1, "CPU frequency trace is now invalid!\n"); + goto out; + } + + tr->start_freq = per_cpu(freq_trans_state, cpu).start_freq; + tr->end_freq = per_cpu(freq_trans_state, cpu).end_freq; + tr->cpu = cpu; + tr->min_us = UINT_MAX; + tr_insert(&freq_trans_tree, tr); + } + tr->total_t = ktime_add(tr->total_t, dur_t); + tr->count++; + + if (dur_us > tr->max_us) + tr->max_us = dur_us; + if (dur_us < tr->min_us) + tr->min_us = dur_us; + + per_cpu(freq_trans_state, cpu).started = false; +out: + spin_unlock_irqrestore(&state_lock, flags); +} + +static void *freq_switch_stat_start(struct tracer_stat *trace) +{ + struct rb_node *n; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + n = rb_first(&freq_trans_tree); + spin_unlock_irqrestore(&state_lock, flags); + + return n; +} + +static void *freq_switch_stat_next(void *prev, int idx) +{ + struct rb_node *n; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + n = rb_next(prev); + spin_unlock_irqrestore(&state_lock, flags); + + return n; +} + +static int freq_switch_stat_show(struct seq_file *s, void *p) +{ + unsigned long flags; + struct trans *tr = p; + + spin_lock_irqsave(&state_lock, flags); + seq_printf(s, "%3d %9d %8d %5d %6lld %6d %6d\n", tr->cpu, + tr->start_freq, tr->end_freq, tr->count, + div_s64(ktime_to_us(tr->total_t), tr->count), + tr->min_us, tr->max_us); + spin_unlock_irqrestore(&state_lock, flags); + + return 0; +} + +static void freq_switch_stat_release(void *stat) +{ + struct trans *tr = stat; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + rb_erase(&tr->node, &freq_trans_tree); + spin_unlock_irqrestore(&state_lock, flags); + kfree(tr); +} + +static int freq_switch_stat_headers(struct seq_file *s) +{ + seq_puts(s, "CPU START_KHZ END_KHZ COUNT AVG_US MIN_US MAX_US\n"); + seq_puts(s, " | | | | | | |\n"); + return 0; +} + +struct tracer_stat freq_switch_stats __read_mostly = { + .name = "cpu_freq_switch", + .stat_start = freq_switch_stat_start, + .stat_next = freq_switch_stat_next, + .stat_show = freq_switch_stat_show, + .stat_release = freq_switch_stat_release, + .stat_headers = freq_switch_stat_headers +}; + +static void trace_freq_switch_disable(void) +{ + unregister_stat_tracer(&freq_switch_stats); + unregister_trace_cpu_frequency_switch_end(probe_end, NULL); + unregister_trace_cpu_frequency_switch_start(probe_start, NULL); + pr_info("disabled cpu frequency switch time profiling\n"); +} + +static int trace_freq_switch_enable(void) +{ + int ret; + + ret = register_trace_cpu_frequency_switch_start(probe_start, NULL); + if (ret) + goto out; + + ret = register_trace_cpu_frequency_switch_end(probe_end, NULL); + if (ret) + goto err_register_switch_end; + + ret = register_stat_tracer(&freq_switch_stats); + if (ret) + goto err_register_stat_tracer; + + pr_info("enabled cpu frequency switch time profiling\n"); + return 0; + +err_register_stat_tracer: + unregister_trace_cpu_frequency_switch_end(probe_end, NULL); +err_register_switch_end: + register_trace_cpu_frequency_switch_start(probe_start, NULL); +out: + pr_err("failed to enable cpu frequency switch time profiling\n"); + + return ret; +} + +static DEFINE_MUTEX(debugfs_lock); +static 
bool trace_freq_switch_enabled; + +static int debug_toggle_tracing(void *data, u64 val) +{ + int ret = 0; + + mutex_lock(&debugfs_lock); + + if (val == 1 && !trace_freq_switch_enabled) + ret = trace_freq_switch_enable(); + else if (val == 0 && trace_freq_switch_enabled) + trace_freq_switch_disable(); + else if (val > 1) + ret = -EINVAL; + + if (!ret) + trace_freq_switch_enabled = val; + + mutex_unlock(&debugfs_lock); + + return ret; +} + +static int debug_tracing_state_get(void *data, u64 *val) +{ + mutex_lock(&debugfs_lock); + *val = trace_freq_switch_enabled; + mutex_unlock(&debugfs_lock); + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(debug_tracing_state_fops, debug_tracing_state_get, + debug_toggle_tracing, "%llu\n"); + +static int __init trace_freq_switch_init(void) +{ + struct dentry *d_tracer = tracing_init_dentry(); + + if (IS_ERR(d_tracer)) + return 0; + + tracefs_create_file("cpu_freq_switch_profile_enabled", + S_IRUGO | S_IWUSR, d_tracer, NULL, &debug_tracing_state_fops); + + return 0; +} +late_initcall(trace_freq_switch_init); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cc9f7a9319be..731f6484b811 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -256,7 +256,8 @@ int perf_trace_add(struct perf_event *p_event, int flags) void perf_trace_del(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - hlist_del_rcu(&p_event->hlist_entry); + if (!hlist_unhashed(&p_event->hlist_entry)) + hlist_del_rcu(&p_event->hlist_entry); tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d202d991edae..fda3b6e1b3a0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -287,14 +287,15 @@ static void output_printk(struct trace_event_buffer *fbuffer) spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } -void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer, + unsigned long len) { if (tracepoint_printk) output_printk(fbuffer); event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc); + fbuffer->flags, fbuffer->pc, len); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..4641bdb40f8f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -64,6 +64,9 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 +/* Flag options */ +#define TRACE_GRAPH_PRINT_FLAT 0x80 + static unsigned int max_depth; static struct tracer_opt trace_opts[] = { @@ -87,6 +90,8 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, /* Include time within nested functions */ { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, + /* Use standard trace formatting rather than hierarchical */ + { TRACER_OPT(funcgraph-flat, TRACE_GRAPH_PRINT_FLAT) }, { } /* Empty entry */ }; @@ -1165,6 +1170,9 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) int cpu = iter->cpu; int ret; + if (flags & TRACE_GRAPH_PRINT_FLAT) + return TRACE_TYPE_UNHANDLED; + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; return TRACE_TYPE_HANDLED; @@ -1222,13 +1230,6 @@ print_graph_function(struct trace_iterator *iter) return 
print_graph_function_flags(iter, tracer_flags.val); } -static enum print_line_t -print_graph_function_event(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return print_graph_function(iter); -} - static void print_lat_header(struct seq_file *s, u32 flags) { static const char spaces[] = " " /* 16 spaces */ @@ -1297,6 +1298,11 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) struct trace_iterator *iter = s->private; struct trace_array *tr = iter->tr; + if (flags & TRACE_GRAPH_PRINT_FLAT) { + trace_default_header(s); + return; + } + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; @@ -1378,19 +1384,6 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) return 0; } -static struct trace_event_functions graph_functions = { - .trace = print_graph_function_event, -}; - -static struct trace_event graph_trace_entry_event = { - .type = TRACE_GRAPH_ENT, - .funcs = &graph_functions, -}; - -static struct trace_event graph_trace_ret_event = { - .type = TRACE_GRAPH_RET, - .funcs = &graph_functions -}; static struct tracer graph_trace __tracer_data = { .name = "function_graph", @@ -1467,16 +1460,6 @@ static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); - if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); - return 1; - } - - if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); - return 1; - } - return register_tracer(&graph_trace); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 282982195e09..3bc4b6de0f4d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -526,11 +526,21 @@ int trace_print_context(struct trace_iterator *iter) unsigned long long t; unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; + int tgid; trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d [%03d] ", - comm, entry->pid, iter->cpu); + trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + + if (tr->trace_flags & TRACE_ITER_TGID) { + tgid = trace_find_tgid(entry->pid); + if (tgid < 0) + trace_seq_puts(s, "(-----) "); + else + trace_seq_printf(s, "(%5d) ", tgid); + } + + trace_seq_printf(s, "[%03d] ", iter->cpu); if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); @@ -845,6 +855,174 @@ static struct trace_event trace_fn_event = { .funcs = &trace_fn_funcs, }; +/* TRACE_GRAPH_ENT */ +static enum print_line_t trace_graph_ent_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_seq *s = &iter->seq; + struct ftrace_graph_ent_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_puts(s, "graph_ent: func="); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + if (!seq_print_ip_sym(s, field->graph_ent.func, flags)) + return TRACE_TYPE_PARTIAL_LINE; + + trace_seq_puts(s, "\n"); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ent_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ent_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "%lx %d\n", + field->graph_ent.func, + field->graph_ent.depth); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t 
trace_graph_ent_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ent_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD(s, field->graph_ent.func); + SEQ_PUT_HEX_FIELD(s, field->graph_ent.depth); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ent_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ent_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD(s, field->graph_ent.func); + SEQ_PUT_FIELD(s, field->graph_ent.depth); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event_functions trace_graph_ent_funcs = { + .trace = trace_graph_ent_trace, + .raw = trace_graph_ent_raw, + .hex = trace_graph_ent_hex, + .binary = trace_graph_ent_bin, +}; + +static struct trace_event trace_graph_ent_event = { + .type = TRACE_GRAPH_ENT, + .funcs = &trace_graph_ent_funcs, +}; + +/* TRACE_GRAPH_RET */ +static enum print_line_t trace_graph_ret_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct ftrace_graph_ret_entry *field; + + trace_assign_type(field, entry); + + trace_seq_puts(s, "graph_ret: func="); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + if (!seq_print_ip_sym(s, field->ret.func, flags)) + return TRACE_TYPE_PARTIAL_LINE; + + trace_seq_puts(s, "\n"); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ret_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ret_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "%lx %lld %lld %ld %d\n", + field->ret.func, + field->ret.calltime, + field->ret.rettime, + field->ret.overrun, + field->ret.depth); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ret_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ret_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD(s, field->ret.func); + SEQ_PUT_HEX_FIELD(s, field->ret.calltime); + SEQ_PUT_HEX_FIELD(s, field->ret.rettime); + SEQ_PUT_HEX_FIELD(s, field->ret.overrun); + SEQ_PUT_HEX_FIELD(s, field->ret.depth); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ret_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ret_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD(s, field->ret.func); + SEQ_PUT_FIELD(s, field->ret.calltime); + SEQ_PUT_FIELD(s, field->ret.rettime); + SEQ_PUT_FIELD(s, field->ret.overrun); + SEQ_PUT_FIELD(s, field->ret.depth); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event_functions trace_graph_ret_funcs = { + .trace = trace_graph_ret_trace, + .raw = trace_graph_ret_raw, + .hex = trace_graph_ret_hex, + .binary = trace_graph_ret_bin, +}; + +static struct trace_event trace_graph_ret_event = { + .type = TRACE_GRAPH_RET, + .funcs = &trace_graph_ret_funcs, +}; + /* TRACE_CTX an TRACE_WAKE */ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) @@ -1222,6 +1400,8 @@ static 
struct trace_event trace_print_event = { static struct trace_event *events[] __initdata = { &trace_fn_event, + &trace_graph_ent_event, + &trace_graph_ret_event, &trace_ctx_event, &trace_wake_event, &trace_stack_event, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9d4399b553a3..78f04e4ad829 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -359,7 +359,8 @@ static bool report_latency(struct trace_array *tr, cycle_t delta) } static void -probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu, + unsigned int load) { if (task != wakeup_task) return; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afbea83f..a01740a98afa 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -336,7 +336,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); event_trigger_unlock_commit(trace_file, buffer, event, entry, - irq_flags, pc); + irq_flags, pc, 0); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -382,7 +382,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->ret = syscall_get_return_value(current, regs); event_trigger_unlock_commit(trace_file, buffer, event, entry, - irq_flags, pc); + irq_flags, pc, 0); } static int reg_event_syscall_enter(struct trace_event_file *file, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d2f6d0be3503..23515a716748 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -821,7 +821,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0); + event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0, 0); } /* uprobe handler */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 198137b1cadc..9472691c1eb0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -20,6 +20,7 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -103,6 +104,11 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU +static cpumask_t __read_mostly watchdog_cpus; +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif static unsigned long soft_lockup_nmi_warn; @@ -114,7 +120,9 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR unsigned int __read_mostly hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI static unsigned long hardlockup_allcpu_dumped; +#endif /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. 
In these @@ -225,7 +233,15 @@ static void __touch_watchdog(void) __this_cpu_write(watchdog_touch_ts, get_timestamp()); } -void touch_softlockup_watchdog(void) +/** + * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls + * + * Call when the scheduler may have stalled for legitimate reasons + * preventing the watchdog task from executing - e.g. the scheduler + * entering idle state. This should only be used for scheduler events. + * Use touch_softlockup_watchdog() for everything else. + */ +void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's timestamp @@ -233,6 +249,12 @@ void touch_softlockup_watchdog(void) */ raw_cpu_write(watchdog_touch_ts, 0); } + +void touch_softlockup_watchdog(void) +{ + touch_softlockup_watchdog_sched(); + wq_watchdog_touch(raw_smp_processor_id()); +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -246,6 +268,7 @@ void touch_all_softlockup_watchdogs(void) */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + wq_watchdog_touch(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -271,7 +294,7 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI /* watchdog detector functions */ static bool is_hardlockup(void) { @@ -285,6 +308,76 @@ static bool is_hardlockup(void) } #endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU +static unsigned int watchdog_next_cpu(unsigned int cpu) +{ + cpumask_t cpus = watchdog_cpus; + unsigned int next_cpu; + + next_cpu = cpumask_next(cpu, &cpus); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(&cpus); + + if (next_cpu == cpu) + return nr_cpu_ids; + + return next_cpu; +} + +static int is_hardlockup_other_cpu(unsigned int cpu) +{ + unsigned long hrint = per_cpu(hrtimer_interrupts, cpu); + + if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) + return 1; + + per_cpu(hrtimer_interrupts_saved, cpu) = hrint; + return 0; +} + +static void watchdog_check_hardlockup_other_cpu(void) +{ + unsigned int next_cpu; + + /* + * Test for hardlockups every 3 samples. The sample period is + * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over + * watchdog_thresh (over by 20%). 
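 *
 * Editor's worked example (not part of the patch): with the default
 * watchdog_thresh of 10 seconds the sample period is 10 * 2 / 5 = 4 seconds,
 * so checking every 3 samples means a CPU must miss its hrtimer for
 * 3 * 4 = 12 seconds, i.e. 1.2 * watchdog_thresh, before the cross-CPU check
 * can flag a hard lockup.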
+ */ + if (__this_cpu_read(hrtimer_interrupts) % 3 != 0) + return; + + /* check for a hardlockup on the next cpu */ + next_cpu = watchdog_next_cpu(smp_processor_id()); + if (next_cpu >= nr_cpu_ids) + return; + + smp_rmb(); + + if (per_cpu(watchdog_nmi_touch, next_cpu) == true) { + per_cpu(watchdog_nmi_touch, next_cpu) = false; + return; + } + + if (is_hardlockup_other_cpu(next_cpu)) { + /* only warn once */ + if (per_cpu(hard_watchdog_warn, next_cpu) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu); + + per_cpu(hard_watchdog_warn, next_cpu) = true; + } else { + per_cpu(hard_watchdog_warn, next_cpu) = false; + } +} +#else +static inline void watchdog_check_hardlockup_other_cpu(void) { return; } +#endif + static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -297,7 +390,7 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, @@ -360,7 +453,7 @@ static void watchdog_overflow_callback(struct perf_event *event, __this_cpu_write(hard_watchdog_warn, false); return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */ static void watchdog_interrupt_count(void) { @@ -384,6 +477,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* kick the hardlockup detector */ watchdog_interrupt_count(); + /* test for hardlockups on the next cpu */ + watchdog_check_hardlockup_other_cpu(); + /* kick the softlockup detector */ wake_up_process(__this_cpu_read(softlockup_watchdog)); @@ -561,7 +657,7 @@ static void watchdog(unsigned int cpu) watchdog_nmi_disable(cpu); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI /* * People like the simple clean cpu node info on boot. * Reduce the watchdog noise by only printing messages @@ -660,9 +756,44 @@ static void watchdog_nmi_disable(unsigned int cpu) } #else +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU +static int watchdog_nmi_enable(unsigned int cpu) +{ + /* + * The new cpu will be marked online before the first hrtimer interrupt + * runs on it. If another cpu tests for a hardlockup on the new cpu + * before it has run its first hrtimer, it will get a false positive. + * Touch the watchdog on the new cpu to delay the first check for at + * least 3 sampling periods to guarantee one hrtimer has run on the new + * cpu. + */ + per_cpu(watchdog_nmi_touch, cpu) = true; + smp_wmb(); + cpumask_set_cpu(cpu, &watchdog_cpus); + return 0; +} + +static void watchdog_nmi_disable(unsigned int cpu) +{ + unsigned int next_cpu = watchdog_next_cpu(cpu); + + /* + * Offlining this cpu will cause the cpu before this one to start + * checking the one after this one. If this cpu just finished checking + * the next cpu and updating hrtimer_interrupts_saved, and then the + * previous cpu checks it within one sample period, it will trigger a + * false positive. Touch the watchdog on the next cpu to prevent it. 
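 *
 * Editor's illustration (not part of the patch): with watchdog_cpus =
 * {0, 1, 2, 3}, each CPU checks its successor (0->1, 1->2, 2->3, 3->0).
 * Offlining CPU 2 makes CPU 1 start checking CPU 3, so CPU 3's
 * watchdog_nmi_touch flag is set here first to suppress one possibly stale
 * comparison of its hrtimer_interrupts count.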
+ */ + if (next_cpu < nr_cpu_ids) + per_cpu(watchdog_nmi_touch, next_cpu) = true; + smp_wmb(); + cpumask_clear_cpu(cpu, &watchdog_cpus); +} +#else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */ static struct smp_hotplug_thread watchdog_threads = { .store = &softlockup_watchdog, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 450c21fd0e6e..ef84d9874d03 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> +#include <linux/bug.h> #include "workqueue_internal.h" @@ -148,6 +149,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -1093,6 +1096,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1395,6 +1400,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; + if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -2052,6 +2059,7 @@ __acquires(&pool->lock) current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); + BUG_ON(PANIC_CORRUPTION); dump_stack(); } @@ -2167,6 +2175,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2250,6 +2260,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2266,9 +2277,14 @@ repeat: * process'em. 
 		 */
 		WARN_ON_ONCE(!list_empty(scheduled));
-		list_for_each_entry_safe(work, n, &pool->worklist, entry)
-			if (get_work_pwq(work) == pwq)
+		list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+			if (get_work_pwq(work) == pwq) {
+				if (first)
+					pool->watchdog_ts = jiffies;
 				move_linked_works(work, scheduled, &n);
+			}
+			first = false;
+		}
 
 		if (!list_empty(scheduled)) {
 			process_scheduled_works(rescuer);
@@ -3079,6 +3095,7 @@ static int init_worker_pool(struct worker_pool *pool)
 	pool->cpu = -1;
 	pool->node = NUMA_NO_NODE;
 	pool->flags |= POOL_DISASSOCIATED;
+	pool->watchdog_ts = jiffies;
 	INIT_LIST_HEAD(&pool->worklist);
 	INIT_LIST_HEAD(&pool->idle_list);
 	hash_init(pool->busy_hash);
@@ -4318,7 +4335,9 @@ void show_workqueue_state(void)
 
 		pr_info("pool %d:", pool->id);
 		pr_cont_pool_info(pool);
-		pr_cont(" workers=%d", pool->nr_workers);
+		pr_cont(" hung=%us workers=%d",
+			jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+			pool->nr_workers);
 		if (pool->manager)
 			pr_cont(" manager: %d",
 				task_pid_nr(pool->manager->task));
@@ -5177,6 +5196,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
 static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
 #endif /* CONFIG_SYSFS */
 
+/*
+ * Workqueue watchdog.
+ *
+ * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
+ * flush dependency, a concurrency managed work item which stays RUNNING
+ * indefinitely. Workqueue stalls can be very difficult to debug as the
+ * usual warning mechanisms don't trigger and internal workqueue state is
+ * largely opaque.
+ *
+ * Workqueue watchdog monitors all worker pools periodically and dumps
+ * state if some pools failed to make forward progress for a while where
+ * forward progress is defined as the first item on ->worklist changing.
+ *
+ * This mechanism is controlled through the kernel parameter
+ * "workqueue.watchdog_thresh" which can be updated at runtime through the
+ * corresponding sysfs parameter file.
+ */
+#ifdef CONFIG_WQ_WATCHDOG
+
+static void wq_watchdog_timer_fn(unsigned long data);
+
+static unsigned long wq_watchdog_thresh = 30;
+static struct timer_list wq_watchdog_timer =
+	TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+
+static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
+static void wq_watchdog_reset_touched(void)
+{
+	int cpu;
+
+	wq_watchdog_touched = jiffies;
+	for_each_possible_cpu(cpu)
+		per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+}
+
+static void wq_watchdog_timer_fn(unsigned long data)
+{
+	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+	bool lockup_detected = false;
+	struct worker_pool *pool;
+	int pi;
+
+	if (!thresh)
+		return;
+
+	rcu_read_lock();
+
+	for_each_pool(pool, pi) {
+		unsigned long pool_ts, touched, ts;
+
+		if (list_empty(&pool->worklist))
+			continue;
+
+		/* get the latest of pool and touched timestamps */
+		pool_ts = READ_ONCE(pool->watchdog_ts);
+		touched = READ_ONCE(wq_watchdog_touched);
+
+		if (time_after(pool_ts, touched))
+			ts = pool_ts;
+		else
+			ts = touched;
+
+		if (pool->cpu >= 0) {
+			unsigned long cpu_touched =
+				READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
+						  pool->cpu));
+			if (time_after(cpu_touched, ts))
+				ts = cpu_touched;
+		}
+
+		/* did we stall? */
+		if (time_after(jiffies, ts + thresh)) {
+			lockup_detected = true;
+			pr_emerg("BUG: workqueue lockup - pool");
+			pr_cont_pool_info(pool);
+			pr_cont(" stuck for %us!\n",
+				jiffies_to_msecs(jiffies - pool_ts) / 1000);
+		}
+	}
+
+	rcu_read_unlock();
+
+	if (lockup_detected)
+		show_workqueue_state();
+
+	wq_watchdog_reset_touched();
+	mod_timer(&wq_watchdog_timer, jiffies + thresh);
+}
+
+void wq_watchdog_touch(int cpu)
+{
+	if (cpu >= 0)
+		per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+	else
+		wq_watchdog_touched = jiffies;
+}
+
+static void wq_watchdog_set_thresh(unsigned long thresh)
+{
+	wq_watchdog_thresh = 0;
+	del_timer_sync(&wq_watchdog_timer);
+
+	if (thresh) {
+		wq_watchdog_thresh = thresh;
+		wq_watchdog_reset_touched();
+		mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
+	}
+}
+
+static int wq_watchdog_param_set_thresh(const char *val,
+					const struct kernel_param *kp)
+{
+	unsigned long thresh;
+	int ret;
+
+	ret = kstrtoul(val, 0, &thresh);
+	if (ret)
+		return ret;
+
+	if (system_wq)
+		wq_watchdog_set_thresh(thresh);
+	else
+		wq_watchdog_thresh = thresh;
+
+	return 0;
+}
+
+static const struct kernel_param_ops wq_watchdog_thresh_ops = {
+	.set = wq_watchdog_param_set_thresh,
+	.get = param_get_ulong,
+};
+
+module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
+		0644);
+
+static void wq_watchdog_init(void)
+{
+	wq_watchdog_set_thresh(wq_watchdog_thresh);
+}
+
+#else /* CONFIG_WQ_WATCHDOG */
+
+static inline void wq_watchdog_init(void) { }
+
+#endif /* CONFIG_WQ_WATCHDOG */
+
 static void __init wq_numa_init(void)
 {
 	cpumask_var_t *tbl;
@@ -5300,6 +5467,9 @@ static int __init init_workqueues(void)
 	       !system_unbound_wq || !system_freezable_wq ||
 	       !system_power_efficient_wq || !system_freezable_power_efficient_wq);
 
+	wq_watchdog_init();
+
 	return 0;
 }
 early_initcall(init_workqueues);
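
Note on the kernel/watchdog.c changes above: each CPU's hrtimer callback increments a per-CPU hrtimer_interrupts counter, and watchdog_check_hardlockup_other_cpu() has every CPU watch its round-robin neighbour (watchdog_next_cpu()); if the neighbour's counter has not advanced since the previous check, the neighbour is assumed to be hard-locked, which lets platforms without an NMI-based perf watchdog still catch a CPU stuck with interrupts disabled. The following stand-alone sketch only models that idea in user space; the pthread/usleep harness, NR_CPUS, STUCK_CPU and the simplified watchdog_next_cpu() are illustrative assumptions, not kernel code.

/*
 * Minimal user-space sketch of the neighbour-CPU hardlockup check
 * (build: cc -pthread -o hardlockup_sketch hardlockup_sketch.c).
 * File name, constants and the thread harness are assumptions.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define NR_CPUS   4
#define STUCK_CPU 2			/* simulated hard-locked CPU */

static atomic_ulong hrtimer_interrupts[NR_CPUS];
static unsigned long hrtimer_interrupts_saved[NR_CPUS];

/* round-robin neighbour, like watchdog_next_cpu() over watchdog_cpus */
static int watchdog_next_cpu(int cpu)
{
	return (cpu + 1) % NR_CPUS;
}

/* same test as is_hardlockup_other_cpu(): no tick since the last check? */
static bool is_hardlockup_other_cpu(int cpu)
{
	unsigned long hrint = atomic_load(&hrtimer_interrupts[cpu]);

	if (hrtimer_interrupts_saved[cpu] == hrint)
		return true;

	hrtimer_interrupts_saved[cpu] = hrint;
	return false;
}

/* stand-in for the per-CPU hrtimer: tick unless this "CPU" is locked up */
static void *cpu_tick(void *arg)
{
	int cpu = (int)(long)arg;

	for (int i = 0; i < 50; i++) {
		if (cpu != STUCK_CPU)
			atomic_fetch_add(&hrtimer_interrupts[cpu], 1);
		usleep(10 * 1000);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NR_CPUS];

	for (long cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_create(&tid[cpu], NULL, cpu_tick, (void *)cpu);

	/* let every CPU tick at least once before the first check */
	usleep(100 * 1000);

	for (int round = 0; round < 3; round++) {
		usleep(100 * 1000);
		for (int cpu = 0; cpu < NR_CPUS; cpu++) {
			int next = watchdog_next_cpu(cpu);

			if (is_hardlockup_other_cpu(next))
				printf("cpu %d: hard LOCKUP suspected on cpu %d\n",
				       cpu, next);
		}
	}

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_join(tid[cpu], NULL);
	return 0;
}

The initial sleep before the first check plays the role that watchdog_nmi_touch plays in the patch: it keeps a CPU that has not yet had a chance to tick from being reported as a false positive.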
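
Note on wq_watchdog_timer_fn() above: a pool counts as stalled only when the newest of three timestamps (pool->watchdog_ts, the global wq_watchdog_touched and, for bound pools, the per-CPU wq_watchdog_touched_cpu) is more than the threshold old, and every comparison goes through the kernel's wrap-safe time_after(). The sketch below reproduces just that decision in user space; the jiffies values in main() are made up and time_after() is re-implemented locally rather than taken from kernel headers.

/*
 * Minimal user-space sketch of the workqueue watchdog stall test
 * (build: cc -o wq_watchdog_sketch wq_watchdog_sketch.c).
 * The sample values below are illustrative assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

/* wrap-safe "a is after b", same trick the kernel's time_after() uses */
static bool time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

static unsigned long latest(unsigned long a, unsigned long b)
{
	return time_after(a, b) ? a : b;
}

/* true if the pool made no forward progress for more than 'thresh' ticks */
static bool pool_stalled(unsigned long now, unsigned long pool_ts,
			 unsigned long global_touched,
			 unsigned long cpu_touched, unsigned long thresh)
{
	unsigned long ts = latest(latest(pool_ts, global_touched), cpu_touched);

	return time_after(now, ts + thresh);
}

int main(void)
{
	unsigned long hz = 100, thresh = 30 * hz;	/* 30s at HZ=100 */
	unsigned long now = 10000;

	/* pool last advanced 40s ago, nothing touched it since: stall */
	printf("stalled: %d\n",
	       pool_stalled(now, now - 40 * hz, now - 60 * hz,
			    now - 60 * hz, thresh));

	/* a recent wq_watchdog_touch() on the pool's CPU holds it off */
	printf("stalled: %d\n",
	       pool_stalled(now, now - 40 * hz, now - 60 * hz,
			    now - 5 * hz, thresh));
	return 0;
}

Casting the unsigned difference to a signed long is what keeps the comparison correct when the jiffies counter wraps, which is why the code never compares raw timestamps with '<' directly.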
