Diffstat (limited to 'kernel')
46 files changed, 766 insertions, 653 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..34f690b9213a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -870,6 +870,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) @@ -1896,7 +1902,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..939945a5649c 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/file.h> #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 48f45987dc6c..63f0e495f517 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1987,7 +1987,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", oldloginuid, loginuid, oldsessionid, sessionid, !rc); @@ -2212,7 +2212,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2237,7 +2237,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2337,7 +2337,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); 
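The audit hunks above switch several records from task_pid_nr() to task_tgid_nr(), so the logged "pid" field identifies the whole process (thread group) instead of whichever thread happened to trigger the event. As a hedged illustration of that distinction (not part of the patch), the following small userspace C program prints the thread-group id and the per-thread id from two threads; it assumes a Linux system where getpid() reports the TGID and syscall(SYS_gettid) reports the per-thread id.

/* Hypothetical demo, not from the patch: every thread in a process
 * shares one TGID (what task_tgid_nr() reports), while each thread has
 * its own TID (what task_pid_nr() reports for that thread).
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *worker(void *arg)
{
	(void)arg;
	printf("worker: tgid=%d tid=%ld\n", getpid(), syscall(SYS_gettid));
	return NULL;
}

int main(void)
{
	pthread_t t;

	printf("main:   tgid=%d tid=%ld\n", getpid(), syscall(SYS_gettid));
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}

Built with "cc -pthread", the two lines print the same tgid but different tids, which is why auditing by TGID gives a stable per-process identifier.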
context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2369,7 +2369,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e94c3c189338..b05fc202b548 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -236,6 +237,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ static bool cgroup_ssid_enabled(int ssid) { + if (CGROUP_SUBSYS_COUNT == 0) + return false; + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } @@ -1649,10 +1653,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1719,15 +1719,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -2010,6 +2001,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root = NULL; @@ -2026,6 +2018,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -2033,15 +2036,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the @@ -2152,9 +2146,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? 
CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2197,6 +2192,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2677,45 +2678,6 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - if (capable(CAP_SYS_NICE)) - return 0; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if (current != task && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - return -EACCES; - } - - return 0; -} - -static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - int i; - int ret; - - for_each_css(css, i, cgrp) { - if (css->ss->allow_attach) { - ret = css->ss->allow_attach(tset); - if (ret) - return ret; - } else { - return -EACCES; - } - } - - return 0; -} - static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) @@ -2730,24 +2692,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - /* - * if the default permission check fails, give each - * cgroup a chance to extend the permission check - */ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct css_set *cset; - cset = task_css_set(task); - list_add(&cset->mg_node, &tset.src_csets); - ret = cgroup_allow_attach(dst_cgrp, &tset); - list_del(&tset.src_csets); - if (ret) - ret = -EACCES; - } + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) + ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; @@ -5447,6 +5394,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 25cfcc804077..2432cc630ffb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -185,10 +185,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { 
cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -616,7 +623,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e3c0f38acbe6..29c7240172d3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2095,21 +2095,18 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } -static int cpuset_allow_attach(struct cgroup_taskset *tset) +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +void cpuset_fork(struct task_struct *task, void *priv) { - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if ((current != task) && !capable(CAP_SYS_ADMIN) && - cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) - return -EACCES; - } + if (task_css_is_root(task, cpuset_cgrp_id)) + return; - return 0; + set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + task->mems_allowed = current->mems_allowed; } struct cgroup_subsys cpuset_cgrp_subsys = { @@ -2118,11 +2115,11 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, - .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 424961e5bd80..8d2482d77c04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1554,12 +1554,33 @@ static int __init perf_workqueue_init(void) core_initcall(perf_workqueue_init); -static inline int pmu_filter_match(struct perf_event *event) +static inline int __pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; return pmu->filter_match ? pmu->filter_match(event) : 1; } +/* + * Check whether we should attempt to schedule an event group based on + * PMU-specific filtering. An event group can consist of HW and SW events, + * potentially with a SW leader, so we must check all the filters, to + * determine whether a group is schedulable: + */ +static inline int pmu_filter_match(struct perf_event *event) +{ + struct perf_event *child; + + if (!__pmu_filter_match(event)) + return 0; + + list_for_each_entry(child, &event->sibling_list, group_entry) { + if (!__pmu_filter_match(child)) + return 0; + } + + return 1; +} + static inline int event_filter_match(struct perf_event *event) { diff --git a/kernel/fork.c b/kernel/fork.c index a46ce4505066..7de03658692b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -769,6 +769,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) EXPORT_SYMBOL(get_mm_exe_file); /** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). 
+ */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + +/** * get_task_mm - acquire a reference to the task's mm * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning @@ -884,14 +907,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) deactivate_mm(tsk, mm); /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. + * Signal userspace if we're not exiting with a core dump + * because we want to leave the value intact for debugging + * purposes. */ if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && + if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 15206453b12a..9812d9c0d483 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -836,7 +836,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT | + IRQD_AFFINITY_MANAGED); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) @@ -845,6 +846,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); + if (irq_settings_has_affinity_managed_set(desc)) + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..a4775f3451b9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -411,8 +411,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, } EXPORT_SYMBOL_GPL(irq_map_generic_chip); +static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + struct irq_domain_chip_generic *dgc = d->gc; + unsigned int hw_irq = data->hwirq; + struct irq_chip_generic *gc; + int irq_idx; + + gc = irq_get_domain_generic_chip(d, hw_irq); + if (!gc) + return; + + irq_idx = hw_irq % dgc->irqs_per_chip; + + clear_bit(irq_idx, &gc->installed); + irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, + NULL); + +} + struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, + .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_generic_chip_ops); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..56afc0be6289 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -105,6 +105,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern bool irq_can_set_affinity_usr(unsigned int irq); + extern int 
irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5cb153a8474a..e5c70dcb7f8e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; -static int __irq_can_set_affinity(struct irq_desc *desc) +static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - return 1; + return false; + return true; } /** @@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq) } /** + * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space + * @irq: Interrupt to check + * + * Like irq_can_set_affinity() above, but additionally checks for the + * AFFINITY_MANAGED flag. + */ +bool irq_can_set_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __irq_can_set_affinity(desc) && + !irqd_affinity_is_managed(&desc->irq_data); +} + +/** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affitnity changed * diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 4b21779d5163..cd6009006510 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -298,6 +298,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->msi_finish(&arg, 0); for_each_msi_entry(desc, dev) { + virq = desc->irq; if (desc->nvec_used == 1) dev_dbg(dev, "irq %d for MSI\n", virq); else diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd5d6d0..a24c5b909047 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, cpumask_var_t new_value; int err; - if (!irq_can_set_affinity(irq) || no_irq_affinity) + if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 320579d89091..f0964f058521 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -17,6 +17,7 @@ enum { _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, + _IRQ_AFFINITY_MANAGED = IRQ_AFFINITY_MANAGED, }; #define IRQ_PER_CPU GOT_YOU_MORON @@ -32,6 +33,7 @@ enum { #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON +#define IRQ_AFFINITY_MANAGED GOT_YOU_MORON static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) @@ -65,6 +67,16 @@ static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) return desc->status_use_accessors & _IRQ_NO_BALANCING; } +static inline void irq_settings_set_affinity_managed(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_AFFINITY_MANAGED; +} + +static inline bool irq_settings_has_affinity_managed_set(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_AFFINITY_MANAGED; +} + static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada0028d2..6030efd4a188 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -934,7 +934,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: 
vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..b17263be9082 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -526,8 +526,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -536,12 +539,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7dd5718836e..3124cebaec31 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -299,12 +299,12 @@ static int create_image(int platform_mode) save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ + restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - /* Restore control flow magically appears here */ - restore_processor_state(); if (!in_suspend) events_check_enabled = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999900..27946975eff0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? 
sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index caadb566e82b..efe1b3b17c88 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 582b66e882ce..e6eceb0aa496 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -597,7 +597,11 @@ void pm_qos_add_request(struct pm_qos_request *req, case PM_QOS_REQ_AFFINE_IRQ: if (irq_can_set_affinity(req->irq)) { struct irq_desc *desc = irq_to_desc(req->irq); - struct cpumask *mask = desc->irq_data.common->affinity; + struct cpumask *mask; + + if (!desc) + return; + mask = desc->irq_data.common->affinity; /* Get the current affinity */ cpumask_copy(&req->cpus_affine, mask); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..f155c62f1f2c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -765,9 +765,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) */ static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur.node = list_entry(bm->cur.node->list.next, - struct rtree_node, list); - if (&bm->cur.node->list != &bm->cur.zone->leaves) { + if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); bm->cur.node_pfn += BM_BITS_PER_BLOCK; bm->cur.node_bit = 0; touch_softlockup_watchdog(); @@ -775,9 +775,9 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /* No more nodes, goto next zone */ - bm->cur.zone = list_entry(bm->cur.zone->list.next, + if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur.zone->list != &bm->zones) { bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void) /* RTCs have initialized by now too ... can we use one? 
*/ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); return 0; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 12cd989dadf6..160e1006640d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -37,6 +37,14 @@ #define HIBERNATE_SIG "S1SUSPEND" /* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + +/* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. @@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio) if (bio_data_dir(bio) == WRITE) put_page(page); + else if (clean_pages_on_read) + flush_icache_range((unsigned long)page_address(page), + (unsigned long)page_address(page) + PAGE_SIZE); if (bio->bi_error && !hb->error) hb->error = bio->bi_error; @@ -1049,6 +1060,7 @@ static int load_image(struct swap_map_handle *handle, hib_init_batch(&hb); + clean_pages_on_read = true; printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; @@ -1124,6 +1136,10 @@ static int lzo_decompress_threadfn(void *data) d->unc_len = LZO_UNC_SIZE; d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, d->unc, &d->unc_len); + if (clean_pages_on_decompress) + flush_icache_range((unsigned long)d->unc, + (unsigned long)d->unc + d->unc_len); + atomic_set(&d->stop, 1); wake_up(&d->done); } @@ -1189,6 +1205,8 @@ static int load_image_lzo(struct swap_map_handle *handle, } memset(crc, 0, offsetof(struct crc_data, go)); + clean_pages_on_decompress = true; + /* * Start the decompression threads. */ diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -9,10 +9,10 @@ char *_braille_console_setup(char **str, char **brl_options) { - if (!memcmp(*str, "brl,", 4)) { + if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; *str += 4; - } else if (!memcmp(str, "brl=", 4)) { + } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); if (!*str) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..2cb46d51d715 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include <linux/trace_events.h> #include <linux/suspend.h> +#include <soc/qcom/watchdog.h> + #include "tree.h" #include "rcu.h" @@ -1298,6 +1300,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); +#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE + /* Induce watchdog bite */ + msm_trigger_wdog_bite(); +#endif + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1333,6 +1340,11 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore(&rnp->lock, flags); +#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE + /* Induce non secure watchdog bite to collect context */ + msm_trigger_wdog_bite(); +#endif + /* * Attempt to revive the RCU machinery by forcing a context switch. 
* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..32cbe72bf545 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2275,6 +2275,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); + cond_resched_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c index fcfda385b74a..5bdd51b1e55e 100644 --- a/kernel/sched/boost.c +++ b/kernel/sched/boost.c @@ -156,9 +156,6 @@ void sched_boost_parse_dt(void) struct device_node *sn; const char *boost_policy; - if (!sched_enable_hmp) - return; - sn = of_find_node_by_path("/sched-hmp"); if (!sn) return; @@ -175,9 +172,6 @@ int sched_set_boost(int type) { int ret = 0; - if (!sched_enable_hmp) - return -EINVAL; - mutex_lock(&boost_mutex); if (verify_boost_params(sysctl_sched_boost, type)) @@ -197,9 +191,6 @@ int sched_boost_handler(struct ctl_table *table, int write, unsigned int *data = (unsigned int *)table->data; unsigned int old_val; - if (!sched_enable_hmp) - return -EINVAL; - mutex_lock(&boost_mutex); old_val = *data; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 519aee32e122..312ffdad034a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2044,6 +2044,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. 
+ */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; @@ -2321,6 +2343,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -3025,8 +3051,9 @@ void sched_exec(void) unsigned long flags; int dest_cpu, curr_cpu; - if (sched_enable_hmp) - return; +#ifdef CONFIG_SCHED_HMP + return; +#endif raw_spin_lock_irqsave(&p->pi_lock, flags); curr_cpu = task_cpu(p); @@ -3734,7 +3761,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3762,11 +3789,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3784,7 +3815,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3792,7 +3823,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3807,7 +3838,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4163,6 +4194,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -4345,17 +4377,14 @@ change: * itself. */ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4365,15 +4394,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). 
*/ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -8215,8 +8243,9 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; - if (sched_enable_hmp) - pr_info("HMP scheduling enabled.\n"); +#ifdef CONFIG_SCHED_HMP + pr_info("HMP scheduling enabled.\n"); +#endif BUG_ON(num_possible_cpus() > BITS_PER_LONG); @@ -8362,6 +8391,7 @@ void __init sched_init(void) rq->cluster = &init_cluster; rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + memset(&rq->grp_time, 0, sizeof(struct group_cpu_time)); rq->old_busy_time = 0; rq->old_estimated_time = 0; rq->old_busy_time_group = 0; @@ -8704,7 +8734,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -8728,7 +8758,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } @@ -9520,7 +9550,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c index aac12bfc2ae6..1e3accddd103 100644 --- a/kernel/sched/core_ctl.c +++ b/kernel/sched/core_ctl.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. +/* Copyright (c) 2014-2017, The Linux Foundation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -36,7 +36,7 @@ struct cluster_data { cpumask_t cpu_mask; unsigned int need_cpus; unsigned int task_thres; - s64 last_isolate_ts; + s64 need_ts; struct list_head lru; bool pending; spinlock_t pending_lock; @@ -277,9 +277,6 @@ static ssize_t show_global_state(const struct cluster_data *state, char *buf) for_each_possible_cpu(cpu) { c = &per_cpu(cpu_state, cpu); - if (!c->cluster) - continue; - cluster = c->cluster; if (!cluster || !cluster->inited) continue; @@ -301,6 +298,9 @@ static ssize_t show_global_state(const struct cluster_data *state, char *buf) count += snprintf(buf + count, PAGE_SIZE - count, "\tIs busy: %u\n", c->is_busy); count += snprintf(buf + count, PAGE_SIZE - count, + "\tNot preferred: %u\n", + c->not_preferred); + count += snprintf(buf + count, PAGE_SIZE - count, "\tNr running: %u\n", cluster->nrrun); count += snprintf(buf + count, PAGE_SIZE - count, "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); @@ -323,13 +323,14 @@ static ssize_t store_not_preferred(struct cluster_data *state, int ret; ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); - if (ret != 1 && ret != state->num_cpus) + if (ret != state->num_cpus) return -EINVAL; - i = 0; spin_lock_irqsave(&state_lock, flags); - list_for_each_entry(c, &state->lru, sib) - c->not_preferred = val[i++]; + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + c->not_preferred = val[i]; + } spin_unlock_irqrestore(&state_lock, flags); return count; @@ -340,11 +341,14 @@ static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) struct cpu_data *c; ssize_t count = 0; unsigned long flags; + int i; spin_lock_irqsave(&state_lock, flags); - list_for_each_entry(c, &state->lru, sib) - count += snprintf(buf + count, PAGE_SIZE - count, - "\tCPU:%d %u\n", c->cpu, c->not_preferred); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "CPU#%d: %u\n", c->cpu, c->not_preferred); + } spin_unlock_irqrestore(&state_lock, flags); return count; @@ -549,6 +553,7 @@ static bool eval_need(struct cluster_data *cluster) bool need_flag = false; unsigned int active_cpus; unsigned int new_need; + s64 now; if (unlikely(!cluster->inited)) return 0; @@ -573,9 +578,10 @@ static bool eval_need(struct cluster_data *cluster) need_flag = adjustment_possible(cluster, new_need); last_need = cluster->need_cpus; - cluster->need_cpus = new_need; + now = ktime_to_ms(ktime_get()); - if (!need_flag) { + if (new_need == last_need) { + cluster->need_ts = now; spin_unlock_irqrestore(&state_lock, flags); return 0; } @@ -583,12 +589,15 @@ static bool eval_need(struct cluster_data *cluster) if (need_cpus > cluster->active_cpus) { ret = 1; } else if (need_cpus < cluster->active_cpus) { - s64 now = ktime_to_ms(ktime_get()); - s64 elapsed = now - cluster->last_isolate_ts; + s64 elapsed = now - cluster->need_ts; ret = elapsed >= cluster->offline_delay_ms; } + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } trace_core_ctl_eval_need(cluster->first_cpu, last_need, need_cpus, ret && need_flag); spin_unlock_irqrestore(&state_lock, flags); @@ -746,7 +755,6 @@ static void try_to_isolate(struct cluster_data *cluster, unsigned int need) if (!sched_isolate_cpu(c->cpu)) { c->isolated_by_us = true; move_cpu_lru(c); - cluster->last_isolate_ts = 
ktime_to_ms(ktime_get()); } else { pr_debug("Unable to isolate CPU%u\n", c->cpu); } @@ -779,7 +787,6 @@ static void try_to_isolate(struct cluster_data *cluster, unsigned int need) if (!sched_isolate_cpu(c->cpu)) { c->isolated_by_us = true; move_cpu_lru(c); - cluster->last_isolate_ts = ktime_to_ms(ktime_get()); } else { pr_debug("Unable to isolate CPU%u\n", c->cpu); } diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f6f9b9b3a4a8..d751bc2d0d6e 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -289,7 +289,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) pr_debug("%s: throttle threshold = %u [ns]\n", __func__, gd->up_throttle_nsec); - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); if (rc) { pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); goto err; @@ -332,7 +332,7 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + sysfs_remove_group(&policy->kobj, get_sysfs_attr()); policy->governor_data = NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c52655581c4c..cf55fc2663fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2620,7 +2620,6 @@ struct cpu_select_env { u8 need_idle:1; u8 need_waker_cluster:1; u8 sync:1; - u8 ignore_prev_cpu:1; enum sched_boost_policy boost_policy; u8 pack_task:1; int prev_cpu; @@ -2630,6 +2629,7 @@ struct cpu_select_env { u64 cpu_load; u32 sbc_best_flag; u32 sbc_best_cluster_flag; + struct cpumask search_cpus; }; struct cluster_cpu_stats { @@ -2834,11 +2834,14 @@ struct cpu_select_env *env, struct cluster_cpu_stats *stats) { struct sched_cluster *next = NULL; int i; + struct cpumask search_cpus; while (!bitmap_empty(env->backup_list, num_clusters)) { next = next_candidate(env->backup_list, 0, num_clusters); __clear_bit(next->id, env->backup_list); - for_each_cpu_and(i, &env->p->cpus_allowed, &next->cpus) { + + cpumask_and(&search_cpus, &env->search_cpus, &next->cpus); + for_each_cpu(i, &search_cpus) { trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), sched_irqload(i), power_cost(i, task_load(env->p) + cpu_cravg_sync(i, env->sync)), 0); @@ -3010,11 +3013,7 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, int i; struct cpumask search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus); - cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask); - - if (env->ignore_prev_cpu) - cpumask_clear_cpu(env->prev_cpu, &search_cpus); + cpumask_and(&search_cpus, &env->search_cpus, &c->cpus); env->need_idle = wake_to_idle(env->p) || c->wake_up_idle; @@ -3026,7 +3025,7 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, power_cost(i, task_load(env->p) + cpu_cravg_sync(i, env->sync)), 0); - if (unlikely(!cpu_active(i)) || skip_cpu(i, env)) + if (skip_cpu(i, env)) continue; update_spare_capacity(stats, env, i, c->capacity, @@ -3081,9 +3080,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) return false; prev_cpu = env->prev_cpu; - if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) || - unlikely(!cpu_active(prev_cpu)) || - cpu_isolated(prev_cpu)) + if (!cpumask_test_cpu(prev_cpu, &env->search_cpus)) return false; if (task->ravg.mark_start - task->last_cpu_selected_ts >= @@ -3116,7 +3113,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats 
*stats) spill_threshold_crossed(env, cpu_rq(prev_cpu))) { update_spare_capacity(stats, env, prev_cpu, cluster->capacity, env->cpu_load); - env->ignore_prev_cpu = 1; + cpumask_clear_cpu(prev_cpu, &env->search_cpus); return false; } @@ -3132,23 +3129,17 @@ wake_to_waker_cluster(struct cpu_select_env *env) } static inline bool -bias_to_waker_cpu(struct task_struct *p, int cpu) +bias_to_waker_cpu(struct cpu_select_env *env, int cpu) { return sysctl_sched_prefer_sync_wakee_to_waker && cpu_rq(cpu)->nr_running == 1 && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) && - cpu_active(cpu) && !cpu_isolated(cpu); + cpumask_test_cpu(cpu, &env->search_cpus); } static inline int -cluster_allowed(struct task_struct *p, struct sched_cluster *cluster) +cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster) { - cpumask_t tmp_mask; - - cpumask_and(&tmp_mask, &cluster->cpus, cpu_active_mask); - cpumask_and(&tmp_mask, &tmp_mask, &p->cpus_allowed); - - return !cpumask_empty(&tmp_mask); + return cpumask_intersects(&env->search_cpus, &cluster->cpus); } /* return cheapest cpu that can fit this task */ @@ -3169,7 +3160,6 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .need_waker_cluster = 0, .sync = sync, .prev_cpu = target, - .ignore_prev_cpu = 0, .rtg = NULL, .sbc_best_flag = 0, .sbc_best_cluster_flag = 0, @@ -3182,6 +3172,9 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); bitmap_zero(env.backup_list, NR_CPUS); + cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask); + cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask); + init_cluster_cpu_stats(&stats); special = env_has_special_flags(&env); @@ -3191,19 +3184,19 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, if (grp && grp->preferred_cluster) { pref_cluster = grp->preferred_cluster; - if (!cluster_allowed(p, pref_cluster)) + if (!cluster_allowed(&env, pref_cluster)) clear_bit(pref_cluster->id, env.candidate_list); else env.rtg = grp; } else if (!special) { cluster = cpu_rq(cpu)->cluster; if (wake_to_waker_cluster(&env)) { - if (bias_to_waker_cpu(p, cpu)) { + if (bias_to_waker_cpu(&env, cpu)) { target = cpu; sbc_flag = SBC_FLAG_WAKER_CLUSTER | SBC_FLAG_WAKER_CPU; goto out; - } else if (cluster_allowed(p, cluster)) { + } else if (cluster_allowed(&env, cluster)) { env.need_waker_cluster = 1; bitmap_zero(env.candidate_list, NR_CPUS); __set_bit(cluster->id, env.candidate_list); @@ -3332,9 +3325,9 @@ void _inc_hmp_sched_stats_fair(struct rq *rq, * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on * efficiency by short-circuiting for_each_sched_entity() loop when - * !sched_enable_hmp || sched_disable_window_stats + * sched_disable_window_stats */ - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; for_each_sched_entity(se) { @@ -3357,7 +3350,7 @@ _dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) struct sched_entity *se = &p->se; /* See comment on efficiency in _inc_hmp_sched_stats_fair */ - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; for_each_sched_entity(se) { @@ -3482,8 +3475,7 @@ static inline int migration_needed(struct task_struct *p, int cpu) int nice; struct related_thread_group *grp; - if (!sched_enable_hmp || p->state != TASK_RUNNING || - p->nr_cpus_allowed == 1) 
+ if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1) return 0; /* No need to migrate task that is about to be throttled */ @@ -3500,8 +3492,15 @@ static inline int migration_needed(struct task_struct *p, int cpu) nice = task_nice(p); rcu_read_lock(); grp = task_related_thread_group(p); + /* + * Don't assume higher capacity means higher power. If the task + * is running on the power efficient CPU, avoid migrating it + * to a lower capacity cluster. + */ if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE || - upmigrate_discouraged(p)) && cpu_capacity(cpu) > min_capacity) { + upmigrate_discouraged(p)) && + cpu_capacity(cpu) > min_capacity && + cpu_max_power_cost(cpu) == max_power_cost) { rcu_read_unlock(); return DOWN_MIGRATION; } @@ -6846,17 +6845,19 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if(prefer_idle) { - // Find a target cpu with lowest - // utilization. + if (prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ if (target_util == 0 || target_util < new_util) { target_cpu = i; target_util = new_util; } } else { - // Find a target cpu with highest - // utilization. + /* Find a target cpu with lowest + * utilization. + */ if (target_util == 0 || target_util > new_util) { target_cpu = i; @@ -7024,8 +7025,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sched_enable_hmp) - return select_best_cpu(p, prev_cpu, 0, sync); +#ifdef CONFIG_SCHED_HMP + return select_best_cpu(p, prev_cpu, 0, sync); +#endif if (sd_flag & SD_BALANCE_WAKE) want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && @@ -7940,10 +7942,11 @@ static int detach_tasks(struct lb_env *env) if (env->imbalance <= 0) return 0; + if (!same_cluster(env->dst_cpu, env->src_cpu)) + env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu)) env->flags |= LBF_IGNORE_BIG_TASKS; - else if (!same_cluster(env->dst_cpu, env->src_cpu)) - env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; redo: while (!list_empty(tasks)) { @@ -8381,7 +8384,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) mcc->cpu = cpu; #ifdef CONFIG_SCHED_DEBUG raw_spin_unlock_irqrestore(&mcc->lock, flags); - pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); goto skip_unlock; #endif } @@ -9312,8 +9316,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - if (sched_enable_hmp) - return find_busiest_queue_hmp(env, group); +#ifdef CONFIG_SCHED_HMP + return find_busiest_queue_hmp(env, group); +#endif for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long capacity, wl; @@ -10119,8 +10124,9 @@ static inline int find_new_ilb(int type) { int ilb; - if (sched_enable_hmp) - return find_new_hmp_ilb(type); +#ifdef CONFIG_SCHED_HMP + return find_new_hmp_ilb(type); +#endif ilb = cpumask_first(nohz.idle_cpus_mask); @@ -10495,8 +10501,9 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type) if (likely(!atomic_read(&nohz.nr_cpus))) return 0; - if (sched_enable_hmp) - return _nohz_kick_needed_hmp(rq, cpu, type); +#ifdef CONFIG_SCHED_HMP + return _nohz_kick_needed_hmp(rq, cpu, type); +#endif if (time_before(now, nohz.next_balance)) return 0; diff --git a/kernel/sched/features.h 
b/kernel/sched/features.h index 7cc74e56fde4..c30c48fde7e6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,4 +73,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) * Energy aware scheduling. Use platform energy model to guide scheduling * decisions optimizing for energy efficiency. */ +#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE +SCHED_FEAT(ENERGY_AWARE, true) +#else SCHED_FEAT(ENERGY_AWARE, false) +#endif diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 95125c5518e2..df47c26ab6d2 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -28,8 +28,7 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"}; -const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", - "RQ_TO_RQ", "GROUP_TO_GROUP"}; +const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP"}; static ktime_t ktime_last; static bool sched_ktime_suspended; @@ -456,6 +455,12 @@ compare_clusters(void *priv, struct list_head *a, struct list_head *b) cluster1 = container_of(a, struct sched_cluster, list); cluster2 = container_of(b, struct sched_cluster, list); + /* + * Don't assume higher capacity means higher power. If the + * power cost is same, sort the higher capacity cluster before + * the lower capacity cluster to start placing the tasks + * on the higher capacity cluster. + */ ret = cluster1->max_power_cost > cluster2->max_power_cost || (cluster1->max_power_cost == cluster2->max_power_cost && cluster1->max_possible_capacity < @@ -616,19 +621,6 @@ int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) return 0; } -int __init set_sched_enable_hmp(char *str) -{ - int enable_hmp = 0; - - get_option(&str, &enable_hmp); - - sched_enable_hmp = !!enable_hmp; - - return 0; -} - -early_param("sched_enable_hmp", set_sched_enable_hmp); - /* Clear any HMP scheduler related requests pending from or on cpu */ void clear_hmp_request(int cpu) { @@ -726,7 +718,7 @@ __read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); unsigned int __read_mostly sysctl_sched_enable_thread_grouping; -__read_mostly unsigned int sysctl_sched_new_task_windows = 5; +#define SCHED_NEW_TASK_WINDOWS 5 #define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 @@ -870,9 +862,6 @@ unsigned int max_task_load(void) return sched_ravg_window; } -/* Use this knob to turn on or off HMP-aware task placement logic */ -unsigned int __read_mostly sched_enable_hmp; - /* A cpu can no longer accommodate more tasks if: * * rq->nr_running > sysctl_sched_spill_nr_run || @@ -970,8 +959,8 @@ unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; unsigned int __read_mostly sysctl_sched_short_burst; unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC; -static void -_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) +static void _update_up_down_migrate(unsigned int *up_migrate, + unsigned int *down_migrate, bool is_group) { unsigned int delta; @@ -985,7 +974,8 @@ _update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) *up_migrate >>= 10; *up_migrate *= NSEC_PER_USEC; - *up_migrate = min(*up_migrate, sched_ravg_window); + if (!is_group) + *up_migrate = min(*up_migrate, sched_ravg_window); *down_migrate /= NSEC_PER_USEC; *down_migrate *= up_down_migrate_scale_factor; @@ -1000,14 +990,14 @@ static void update_up_down_migrate(void) unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); - 
_update_up_down_migrate(&up_migrate, &down_migrate); + _update_up_down_migrate(&up_migrate, &down_migrate, false); sched_upmigrate = up_migrate; sched_downmigrate = down_migrate; up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct); down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct); - _update_up_down_migrate(&up_migrate, &down_migrate); + _update_up_down_migrate(&up_migrate, &down_migrate, true); sched_group_upmigrate = up_migrate; sched_group_downmigrate = down_migrate; } @@ -1245,7 +1235,7 @@ unlock: void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) { - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; if (is_big_task(p)) @@ -1254,7 +1244,7 @@ void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) { - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; if (is_big_task(p)) @@ -1323,7 +1313,7 @@ void fixup_nr_big_tasks(struct hmp_sched_stats *stats, u64 new_task_load; u64 old_task_load; - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); @@ -1433,9 +1423,6 @@ int sched_window_update_handler(struct ctl_table *table, int write, unsigned int *data = (unsigned int *)table->data; unsigned int old_val; - if (!sched_enable_hmp) - return -EINVAL; - mutex_lock(&policy_mutex); old_val = *data; @@ -1471,9 +1458,6 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write, unsigned int *data = (unsigned int *)table->data; int update_task_count = 0; - if (!sched_enable_hmp) - return 0; - /* * The policy mutex is acquired with cpu_hotplug.lock * held from cpu_up()->cpufreq_governor_interactive()-> @@ -1713,45 +1697,19 @@ static inline unsigned int load_to_freq(struct rq *rq, u64 load) return freq; } -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu); - -/* - * Return load from all related group in given cpu. - * Caller must ensure that related_thread_group_lock is held. - */ -static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) -{ - struct related_thread_group *grp; - - for_each_related_thread_group(grp) { - struct group_cpu_time *cpu_time; - - cpu_time = _group_cpu_time(grp, cpu); - *grp_load += cpu_time->prev_runnable_sum; - if (new_grp_load) - *new_grp_load += cpu_time->nt_prev_runnable_sum; - } -} - /* * Return load from all related groups in given frequency domain. - * Caller must ensure that related_thread_group_lock is held. 
*/ static void group_load_in_freq_domain(struct cpumask *cpus, u64 *grp_load, u64 *new_grp_load) { - struct related_thread_group *grp; int j; - for_each_related_thread_group(grp) { - for_each_cpu(j, cpus) { - struct group_cpu_time *cpu_time; + for_each_cpu(j, cpus) { + struct rq *rq = cpu_rq(j); - cpu_time = _group_cpu_time(grp, j); - *grp_load += cpu_time->prev_runnable_sum; - *new_grp_load += cpu_time->nt_prev_runnable_sum; - } + *grp_load += rq->grp_time.prev_runnable_sum; + *new_grp_load += rq->grp_time.nt_prev_runnable_sum; } } @@ -1776,9 +1734,6 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups) int rc = 0; u64 group_load = 0, new_load = 0; - if (!sched_enable_hmp) - return 0; - if (check_pred) { u64 prev = rq->old_busy_time; u64 predicted = rq->hmp_stats.pred_demands_sum; @@ -1796,20 +1751,18 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups) if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) return 0; } else { - read_lock_irqsave(&related_thread_group_lock, flags); /* * Protect from concurrent update of rq->prev_runnable_sum and * group cpu load */ - raw_spin_lock(&rq->lock); + raw_spin_lock_irqsave(&rq->lock, flags); if (check_groups) - _group_load_in_cpu(cpu_of(rq), &group_load, NULL); + group_load = rq->grp_time.prev_runnable_sum; new_load = rq->prev_runnable_sum + group_load; new_load = freq_policy_load(rq, new_load); - raw_spin_unlock(&rq->lock); - read_unlock_irqrestore(&related_thread_group_lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); cur_freq = load_to_freq(rq, rq->old_busy_time); freq_required = load_to_freq(rq, new_load); @@ -1897,7 +1850,7 @@ static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, static inline bool is_new_task(struct task_struct *p) { - return p->ravg.active_windows < sysctl_sched_new_task_windows; + return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS; } #define INC_STEP 8 @@ -2283,6 +2236,31 @@ static void rollover_task_window(struct task_struct *p, bool full_window) } } +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + u64 grp_curr_sum = rq->grp_time.curr_runnable_sum; + u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + grp_curr_sum = 0; + grp_nt_curr_sum = 0; + } + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + rq->grp_time.prev_runnable_sum = grp_curr_sum; + rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; + rq->grp_time.curr_runnable_sum = 0; + rq->grp_time.nt_curr_runnable_sum = 0; +} + /* * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) */ @@ -2299,8 +2277,6 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, u64 *prev_runnable_sum = &rq->prev_runnable_sum; u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; - int flip_counters = 0; - int prev_sum_reset = 0; bool new_task; struct related_thread_group *grp; int cpu = rq->cpu; @@ -2315,51 +2291,6 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, new_task = is_new_task(p); - grp = p->grp; - if (grp && sched_freq_aggregate) { - /* cpu_time protected by rq_lock */ - struct group_cpu_time *cpu_time = - _group_cpu_time(grp, cpu_of(rq)); - - curr_runnable_sum = 
&cpu_time->curr_runnable_sum; - prev_runnable_sum = &cpu_time->prev_runnable_sum; - - nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - - if (cpu_time->window_start != rq->window_start) { - int nr_windows; - - delta = rq->window_start - cpu_time->window_start; - nr_windows = div64_u64(delta, window_size); - if (nr_windows > 1) - prev_sum_reset = 1; - - cpu_time->window_start = rq->window_start; - flip_counters = 1; - } - - if (p_is_curr_task && new_window) { - u64 curr_sum = rq->curr_runnable_sum; - u64 nt_curr_sum = rq->nt_curr_runnable_sum; - - if (full_window) - curr_sum = nt_curr_sum = 0; - - rq->prev_runnable_sum = curr_sum; - rq->nt_prev_runnable_sum = nt_curr_sum; - - rq->curr_runnable_sum = 0; - rq->nt_curr_runnable_sum = 0; - } - } else { - if (p_is_curr_task && new_window) { - flip_counters = 1; - if (full_window) - prev_sum_reset = 1; - } - } - /* * Handle per-task window rollover. We don't care about the idle * task or exiting tasks. @@ -2369,26 +2300,25 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, rollover_task_window(p, full_window); } - if (flip_counters) { - u64 curr_sum = *curr_runnable_sum; - u64 nt_curr_sum = *nt_curr_runnable_sum; + if (p_is_curr_task && new_window) { + rollover_cpu_window(rq, full_window); + rollover_top_tasks(rq, full_window); + } - if (prev_sum_reset) - curr_sum = nt_curr_sum = 0; + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; - *prev_runnable_sum = curr_sum; - *nt_prev_runnable_sum = nt_curr_sum; + grp = p->grp; + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time = &rq->grp_time; - *curr_runnable_sum = 0; - *nt_curr_runnable_sum = 0; + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; - if (p_is_curr_task) - rollover_top_tasks(rq, full_window); + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; } - if (!account_busy_for_cpu_time(rq, p, irqtime, event)) - goto done; - if (!new_window) { /* * account_busy_for_cpu_time() = 1 so busy time needs @@ -2864,7 +2794,7 @@ static u64 update_task_demand(struct task_struct *p, struct rq *rq, } static inline void -update_task_burst(struct task_struct *p, struct rq *rq, int event, int runtime) +update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime) { /* * update_task_demand() has checks for idle task and @@ -2905,7 +2835,7 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event, done: trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, rq->cc.cycles, rq->cc.time, - _group_cpu_time(p->grp, cpu_of(rq))); + p->grp ? 
&rq->grp_time : NULL); p->ravg.mark_start = wallclock; } @@ -3012,7 +2942,7 @@ void set_window_start(struct rq *rq) { static int sync_cpu_available; - if (rq->window_start || !sched_enable_hmp) + if (rq->window_start) return; if (!sync_cpu_available) { @@ -3063,7 +2993,6 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) u64 start_ts = sched_ktime_clock(); int reason = WINDOW_CHANGE; unsigned int old = 0, new = 0; - struct related_thread_group *grp; local_irq_save(flags); @@ -3081,19 +3010,6 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) read_unlock(&tasklist_lock); - list_for_each_entry(grp, &active_related_thread_groups, list) { - int j; - - for_each_possible_cpu(j) { - struct group_cpu_time *cpu_time; - /* Protected by rq lock */ - cpu_time = _group_cpu_time(grp, j); - memset(cpu_time, 0, sizeof(struct group_cpu_time)); - if (window_start) - cpu_time->window_start = window_start; - } - } - if (window_size) { sched_ravg_window = window_size * TICK_NSEC; set_hmp_defaults(); @@ -3109,6 +3025,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) rq->window_start = window_start; rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + memset(&rq->grp_time, 0, sizeof(struct group_cpu_time)); for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { memset(&rq->load_subs[i], 0, sizeof(struct load_subtractions)); @@ -3198,15 +3115,12 @@ static inline u64 freq_policy_load(struct rq *rq, u64 load) case FREQ_REPORT_CPU_LOAD: break; default: - WARN_ON_ONCE(1); + break; } return load; } -static inline void -sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); - void sched_get_cpus_busy(struct sched_load *busy, const struct cpumask *query_cpus) { @@ -3223,7 +3137,6 @@ void sched_get_cpus_busy(struct sched_load *busy, unsigned int window_size; u64 max_prev_sum = 0; int max_busy_cpu = cpumask_first(query_cpus); - struct related_thread_group *grp; u64 total_group_load = 0, total_ngload = 0; bool aggregate_load = false; struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus)); @@ -3233,8 +3146,6 @@ void sched_get_cpus_busy(struct sched_load *busy, local_irq_save(flags); - read_lock(&related_thread_group_lock); - /* * This function could be called in timer context, and the * current task may have been executing for a long time. 
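[Editor's note] The flip_counters/prev_sum_reset bookkeeping removed in the hunks above is replaced by rollover_cpu_window(), which rolls the per-rq sums and the new rq->grp_time group sums in one place. Below is a minimal userspace model of that rollover, not kernel code: the types are simplified and the nt_* "new task" sums are omitted for brevity.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Simplified stand-ins for the kernel structures touched above. */
struct group_cpu_time {
	uint64_t curr_runnable_sum;
	uint64_t prev_runnable_sum;
};

struct rq_model {
	uint64_t curr_runnable_sum;
	uint64_t prev_runnable_sum;
	struct group_cpu_time grp_time;
};

/* On a window boundary, "curr" becomes "prev" (or 0 if more than one
 * full window elapsed) and "curr" restarts from zero, for both the rq
 * sums and the per-rq group sums. */
static void rollover_cpu_window(struct rq_model *rq, bool full_window)
{
	uint64_t curr = full_window ? 0 : rq->curr_runnable_sum;
	uint64_t grp_curr = full_window ? 0 : rq->grp_time.curr_runnable_sum;

	rq->prev_runnable_sum = curr;
	rq->grp_time.prev_runnable_sum = grp_curr;
	rq->curr_runnable_sum = 0;
	rq->grp_time.curr_runnable_sum = 0;
}

int main(void)
{
	struct rq_model rq = { .curr_runnable_sum = 1000 };

	rq.grp_time.curr_runnable_sum = 400;
	rollover_cpu_window(&rq, false);
	printf("prev=%llu grp_prev=%llu\n",
	       (unsigned long long)rq.prev_runnable_sum,
	       (unsigned long long)rq.grp_time.prev_runnable_sum);
	return 0;
}

Because the group counters now live in struct rq, the rollover no longer needs the per-group window_start synchronization that sync_window_start() used to provide.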
Ensure @@ -3287,15 +3198,6 @@ void sched_get_cpus_busy(struct sched_load *busy, raw_spin_unlock(&cluster->load_lock); - for_each_related_thread_group(grp) { - for_each_cpu(cpu, query_cpus) { - /* Protected by rq_lock */ - struct group_cpu_time *cpu_time = - _group_cpu_time(grp, cpu); - sync_window_start(cpu_rq(cpu), cpu_time); - } - } - group_load_in_freq_domain( &cpu_rq(max_busy_cpu)->freq_domain_cpumask, &total_group_load, &total_ngload); @@ -3316,7 +3218,8 @@ void sched_get_cpus_busy(struct sched_load *busy, ngload[i] = total_ngload; } } else { - _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); + group_load[i] = rq->grp_time.prev_runnable_sum; + ngload[i] = rq->grp_time.nt_prev_runnable_sum; } load[i] += group_load[i]; @@ -3341,8 +3244,6 @@ skip_early: for_each_cpu(cpu, query_cpus) raw_spin_unlock(&(cpu_rq(cpu))->lock); - read_unlock(&related_thread_group_lock); - local_irq_restore(flags); i = 0; @@ -3373,7 +3274,9 @@ exit_early: trace_sched_get_busy(cpu, busy[i].prev_load, busy[i].new_task_load, busy[i].predicted_load, - early_detection[i]); + early_detection[i], + aggregate_load && + cpu == max_busy_cpu); i++; } } @@ -3620,7 +3523,7 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) bool new_task; struct related_thread_group *grp; - if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING)) + if (!p->on_rq && p->state != TASK_WAKING) return; if (exiting_task(p)) { @@ -3659,18 +3562,17 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) if (grp && sched_freq_aggregate) { struct group_cpu_time *cpu_time; - cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); + cpu_time = &src_rq->grp_time; src_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); + cpu_time = &dest_rq->grp_time; dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - sync_window_start(dest_rq, cpu_time); if (p->ravg.curr_window) { *src_curr_runnable_sum -= p->ravg.curr_window; @@ -3799,61 +3701,6 @@ void set_preferred_cluster(struct related_thread_group *grp) #define DEFAULT_CGROUP_COLOC_ID 1 -static inline void free_group_cputime(struct related_thread_group *grp) -{ - free_percpu(grp->cpu_time); -} - -static int alloc_group_cputime(struct related_thread_group *grp) -{ - int i; - struct group_cpu_time *cpu_time; - int cpu = raw_smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - u64 window_start = rq->window_start; - - grp->cpu_time = alloc_percpu_gfp(struct group_cpu_time, GFP_ATOMIC); - if (!grp->cpu_time) - return -ENOMEM; - - for_each_possible_cpu(i) { - cpu_time = per_cpu_ptr(grp->cpu_time, i); - memset(cpu_time, 0, sizeof(struct group_cpu_time)); - cpu_time->window_start = window_start; - } - - return 0; -} - -/* - * A group's window_start may be behind. When moving it forward, flip prev/curr - * counters. 
When moving forward > 1 window, prev counter is set to 0 - */ -static inline void -sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) -{ - u64 delta; - int nr_windows; - u64 curr_sum = cpu_time->curr_runnable_sum; - u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; - - delta = rq->window_start - cpu_time->window_start; - if (!delta) - return; - - nr_windows = div64_u64(delta, sched_ravg_window); - if (nr_windows > 1) - curr_sum = nt_curr_sum = 0; - - cpu_time->prev_runnable_sum = curr_sum; - cpu_time->curr_runnable_sum = 0; - - cpu_time->nt_prev_runnable_sum = nt_curr_sum; - cpu_time->nt_curr_runnable_sum = 0; - - cpu_time->window_start = rq->window_start; -} - /* * Task's cpu usage is accounted in: * rq->curr/prev_runnable_sum, when its ->grp is NULL @@ -3871,7 +3718,6 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; - struct migration_sum_data d; int migrate_type; int cpu = cpu_of(rq); bool new_task; @@ -3886,15 +3732,10 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); new_task = is_new_task(p); - /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ - cpu_time = _group_cpu_time(grp, cpu); + cpu_time = &rq->grp_time; if (event == ADD_TASK) { - sync_window_start(rq, cpu_time); migrate_type = RQ_TO_GROUP; - d.src_rq = rq; - d.src_cpu_time = NULL; - d.dst_rq = NULL; - d.dst_cpu_time = cpu_time; + src_curr_runnable_sum = &rq->curr_runnable_sum; dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &rq->prev_runnable_sum; @@ -3919,17 +3760,7 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, } else { migrate_type = GROUP_TO_RQ; - d.src_rq = NULL; - d.src_cpu_time = cpu_time; - d.dst_rq = rq; - d.dst_cpu_time = NULL; - /* - * In case of REM_TASK, cpu_time->window_start would be - * uptodate, because of the update_task_ravg() we called - * above on the moving task. Hence no need for - * sync_window_start() - */ src_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_curr_runnable_sum = &rq->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; @@ -3975,7 +3806,7 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window; p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window; - trace_sched_migration_update_sum(p, migrate_type, &d); + trace_sched_migration_update_sum(p, migrate_type, rq); BUG_ON((s64)*src_curr_runnable_sum < 0); BUG_ON((s64)*src_prev_runnable_sum < 0); @@ -3983,18 +3814,6 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, BUG_ON((s64)*src_nt_prev_runnable_sum < 0); } -static inline struct group_cpu_time * -task_group_cpu_time(struct task_struct *p, int cpu) -{ - return _group_cpu_time(rcu_dereference(p->grp), cpu); -} - -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu) -{ - return grp ? 
per_cpu_ptr(grp->cpu_time, cpu) : NULL; -} - static inline struct related_thread_group* lookup_related_thread_group(unsigned int group_id) { @@ -4014,12 +3833,6 @@ int alloc_related_thread_groups(void) goto err; } - if (alloc_group_cputime(grp)) { - kfree(grp); - ret = -ENOMEM; - goto err; - } - grp->id = i; INIT_LIST_HEAD(&grp->tasks); INIT_LIST_HEAD(&grp->list); @@ -4034,7 +3847,6 @@ err: for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { grp = lookup_related_thread_group(i); if (grp) { - free_group_cputime(grp); kfree(grp); related_thread_groups[i] = NULL; } else { @@ -4418,9 +4230,6 @@ static int register_sched_callback(void) { int ret; - if (!sched_enable_hmp) - return 0; - ret = cpufreq_register_notifier(¬ifier_policy_block, CPUFREQ_POLICY_NOTIFIER); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3fe00d6fa335..07b2c63e4983 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -441,7 +441,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -487,8 +487,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -504,7 +504,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -521,7 +521,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1257,7 +1257,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1270,26 +1293,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. 
*/ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1298,7 +1332,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1311,31 +1345,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1351,7 +1385,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1363,7 +1397,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); @@ -1406,6 +1440,7 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); +#ifdef CONFIG_SCHED_HMP static int select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) { @@ -1419,6 +1454,7 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, 
int sd_flag, int flags) return cpu; } +#endif static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) @@ -1426,8 +1462,9 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sched_enable_hmp) - return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); +#ifdef CONFIG_SCHED_HMP + return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); +#endif /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1796,14 +1833,6 @@ static int find_lowest_rq_hmp(struct task_struct *task) return best_cpu; } - -#else /* CONFIG_SCHED_HMP */ - -static int find_lowest_rq_hmp(struct task_struct *task) -{ - return -1; -} - #endif /* CONFIG_SCHED_HMP */ static int find_lowest_rq(struct task_struct *task) @@ -1813,8 +1842,9 @@ static int find_lowest_rq(struct task_struct *task) int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - if (sched_enable_hmp) - return find_lowest_rq_hmp(task); +#ifdef CONFIG_SCHED_HMP + return find_lowest_rq_hmp(task); +#endif /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d907eeb297a3..75500042fd32 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -366,6 +366,13 @@ struct load_subtractions { u64 new_subs; }; +struct group_cpu_time { + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +}; + struct sched_cluster { raw_spinlock_t load_lock; struct list_head list; @@ -407,12 +414,6 @@ struct related_thread_group { struct sched_cluster *preferred_cluster; struct rcu_head rcu; u64 last_update; - struct group_cpu_time __percpu *cpu_time; /* one per cluster */ -}; - -struct migration_sum_data { - struct rq *src_rq, *dst_rq; - struct group_cpu_time *src_cpu_time, *dst_cpu_time; }; extern struct list_head cluster_head; @@ -776,6 +777,7 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; + struct group_cpu_time grp_time; struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; DECLARE_BITMAP_ARRAY(top_tasks_bitmap, NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES); @@ -1069,10 +1071,6 @@ enum sched_boost_policy { #define WINDOW_STATS_AVG 3 #define WINDOW_STATS_INVALID_POLICY 4 -#define FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK 0 -#define FREQ_REPORT_CPU_LOAD 1 -#define FREQ_REPORT_TOP_TASK 2 - #define SCHED_UPMIGRATE_MIN_NICE 15 #define EXITING_TASK_MARKER 0xdeaddead @@ -1083,7 +1081,6 @@ enum sched_boost_policy { extern struct mutex policy_mutex; extern unsigned int sched_ravg_window; extern unsigned int sched_disable_window_stats; -extern unsigned int sched_enable_hmp; extern unsigned int max_possible_freq; extern unsigned int min_max_freq; extern unsigned int pct_task_load(struct task_struct *p); @@ -1127,7 +1124,6 @@ extern void update_cluster_topology(void); extern void note_task_waking(struct task_struct *p, u64 wallclock); extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); extern void init_clusters(void); -extern int __init set_sched_enable_hmp(char *str); extern void reset_cpu_hmp_stats(int cpu, int reset_cra); extern unsigned int max_task_load(void); extern void sched_account_irqtime(int cpu, struct task_struct *curr, @@ -1257,7 +1253,7 @@ inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, { u32 task_load; - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; task_load = 
sched_disable_window_stats ? 0 : p->ravg.demand; @@ -1272,7 +1268,7 @@ dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, { u32 task_load; - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; task_load = sched_disable_window_stats ? 0 : p->ravg.demand; @@ -1290,7 +1286,7 @@ fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats, struct task_struct *p, s64 task_load_delta, s64 pred_demand_delta) { - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; stats->cumulative_runnable_avg += task_load_delta; @@ -1350,14 +1346,6 @@ check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups); extern void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, struct task_struct *p); -struct group_cpu_time { - u64 curr_runnable_sum; - u64 prev_runnable_sum; - u64 nt_curr_runnable_sum; - u64 nt_prev_runnable_sum; - u64 window_start; -}; - /* Is frequency of two cpus synchronized with each other? */ static inline int same_freq_domain(int src_cpu, int dst_cpu) { @@ -1667,7 +1655,6 @@ static inline int update_preferred_cluster(struct related_thread_group *grp, static inline void add_new_task_to_grp(struct task_struct *new) {} -#define sched_enable_hmp 0 #define PRED_DEMAND_DELTA (0) static inline void @@ -1954,19 +1941,41 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
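[Editor's note] The move_entity() helper added earlier in kernel/sched/rt.c encodes the rule this comment describes: the entity stays on its priority list only for a SAVE that is not also a MOVE. A tiny userspace check of the combinations, with the flag values copied from the defines added below (illustrative only, not kernel code):

#include <stdio.h>
#include <stdbool.h>

#define DEQUEUE_SLEEP	0x01
#define DEQUEUE_SAVE	0x02	/* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE	0x04	/* matches ENQUEUE_MOVE */

/* Same predicate as the kernel helper: skip the list manipulation
 * only when SAVE is set without MOVE. */
static bool move_entity(unsigned int flags)
{
	return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
}

int main(void)
{
	printf("no flags  -> move=%d\n", move_entity(0));
	printf("SAVE      -> move=%d\n", move_entity(DEQUEUE_SAVE));
	printf("SAVE|MOVE -> move=%d\n",
	       move_entity(DEQUEUE_SAVE | DEQUEUE_MOVE));
	return 0;
}

Only the SAVE-without-MOVE case leaves rt_se->on_list and the priority bitmap untouched; a plain dequeue or a SAVE|MOVE pair manipulates the list as before.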
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 -#define ENQUEUE_WAKEUP_NEW 0x20 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 +#define ENQUEUE_WAKEUP_NEW 0x40 #define RETRY_TASK ((void *)-1UL) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b2ff383d6062..b0c5fe6d1f3b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -896,7 +896,6 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, - .allow_attach = subsys_cgroup_allow_attach, .attach = schedtune_attach, }; @@ -910,6 +909,7 @@ schedtune_init_cgroups(void) for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); memset(bg, 0, sizeof(struct boost_groups)); + raw_spin_lock_init(&bg->lock); } pr_info("schedtune: configured to support %d boost groups\n", diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 07b7f84b37e2..2ffb1680b380 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,7 +22,6 @@ #include <linux/syscore_ops.h> #include <linux/cpufreq.h> #include <trace/events/sched.h> -#include <clocksource/arm_arch_timer.h> #include "sched.h" #include "walt.h" @@ -188,10 +187,8 @@ update_window_start(struct rq *rq, u64 wallclock) delta = wallclock - rq->window_start; /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ if (delta < 0) { - if (arch_timer_read_counter() == 0) - delta = 0; - else - BUG_ON(1); + delta = 0; + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n"); } if (delta < walt_ravg_window) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d3873333c766..51eef8e7df39 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -133,6 +133,7 @@ static int ten_thousand = 10000; #endif #ifdef CONFIG_SCHED_HMP static int one_thousand = 1000; +static int max_freq_reporting_policy = FREQ_REPORT_INVALID_POLICY - 1; #endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ @@ -178,7 +179,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -297,6 +298,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, + .extra2 = &max_freq_reporting_policy, }, { .procname = "sched_freq_inc_notify", @@ -447,13 +449,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "sched_new_task_windows", - .data = &sysctl_sched_new_task_windows, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_window_update_handler, - }, - { .procname = "sched_pred_alert_freq", .data = 
&sysctl_sched_pred_alert_freq, .maxlen = sizeof(unsigned int), @@ -591,7 +586,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "sched_shares_window_ns", @@ -2375,6 +2371,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, @@ -2502,8 +2513,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - NULL,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -3116,6 +3146,12 @@ int proc_dointvec(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3161,6 +3197,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, * exception granted :-) */ EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0cdc34ebd8d1..2af5687b83c9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -412,12 +412,10 @@ static int alarmtimer_suspend(struct device *dev) now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); if (poweron_alarm) { - struct rtc_time tm_val; - unsigned long secs; + uint64_t msec = 0; - tm_val = rtc_ktime_to_tm(min); - rtc_tm_to_time(&tm_val, &secs); - lpm_suspend_wake_time(secs); + msec = ktime_to_ms(min); + lpm_suspend_wake_time(msec); } else { /* Set alarm, if in the past reject suspend briefly to handle */ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b98810d2f3b4..89cc82a38e4d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -108,7 +108,7 @@ static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); -static void clocksource_select(void); +static void clocksource_select(bool force); static LIST_HEAD(watchdog_list); 
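[Editor's note] proc_douintvec() added above pairs the existing integer parsing with do_proc_douintvec_conv(), which refuses negative input on write. A minimal sketch of how a caller might wire up the new handler; the demo_uint sysctl, the table, and the module here are hypothetical and not part of this patch:

#include <linux/module.h>
#include <linux/sysctl.h>

static unsigned int demo_val;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_uint",
		.data		= &demo_val,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,	/* new handler */
	},
	{ }
};

static struct ctl_table_header *demo_header;

static int __init demo_init(void)
{
	demo_header = register_sysctl("kernel", demo_table);
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	unregister_sysctl_table(demo_header);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With that registered, writing a negative value to /proc/sys/kernel/demo_uint fails with EINVAL, whereas proc_dointvec would have accepted and stored it as a signed value.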
static struct clocksource *watchdog; @@ -415,7 +415,7 @@ static int clocksource_watchdog_kthread(void *data) { mutex_lock(&clocksource_mutex); if (__clocksource_watchdog_kthread()) - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -555,11 +555,12 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur, + bool force) { struct clocksource *cs; - if (!finished_booting || list_empty(&clocksource_list)) + if ((!finished_booting && !force) || list_empty(&clocksource_list)) return NULL; /* @@ -577,13 +578,13 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) return NULL; } -static void __clocksource_select(bool skipcur) +static void __clocksource_select(bool skipcur, bool force) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot, skipcur); + best = clocksource_find_best(oneshot, skipcur, force); if (!best) return; @@ -623,22 +624,40 @@ static void __clocksource_select(bool skipcur) * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. */ -static void clocksource_select(void) +static void clocksource_select(bool force) { - __clocksource_select(false); + return __clocksource_select(false, force); } static void clocksource_select_fallback(void) { - __clocksource_select(true); + __clocksource_select(true, false); } #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ -static inline void clocksource_select(void) { } + +static inline void clocksource_select(bool force) { } static inline void clocksource_select_fallback(void) { } #endif +/** + * clocksource_select_force - Force re-selection of the best clocksource + * among registered clocksources + * + * clocksource_select() can't select the best clocksource before + * calling clocksource_done_booting() and since clocksource_select() + * should be called with clocksource_mutex held, provide a new API + * can be called from other files to select best clockrouce irrespective + * of finished_booting flag. 
+ */ +void clocksource_select_force(void) +{ + mutex_lock(&clocksource_mutex); + clocksource_select(true); + mutex_unlock(&clocksource_mutex); +} + /* * clocksource_done_booting - Called near the end of core bootup * @@ -655,7 +674,7 @@ static int __init clocksource_done_booting(void) * Run the watchdog first to eliminate unstable clock sources */ __clocksource_watchdog_kthread(); - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -744,6 +763,7 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq } EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); + /** * __clocksource_register_scale - Used to install new clocksources * @cs: clocksource to be registered @@ -765,7 +785,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); - clocksource_select(); + clocksource_select(false); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; @@ -788,7 +808,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) { mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); - clocksource_select(); + clocksource_select(false); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } @@ -892,7 +912,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, ret = sysfs_get_uname(buf, override_name, count); if (ret >= 0) - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4ff237dbc006..ede4bf13d3e9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** @@ -385,8 +402,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += clocksource_delta(tkr->read(tkr->clock), - tkr->cycle_last, tkr->mask); + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tkr->read(tkr->clock), + tkr->cycle_last, + tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); return now; @@ -404,6 +424,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. 
+ * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2acad4b6a92a..2963266fb7bf 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,4 +1,8 @@ +# We are fully aware of the dangers of __builtin_return_address() +FRAME_CFLAGS := $(call cc-disable-warning,frame-address) +KBUILD_CFLAGS += $(FRAME_CFLAGS) + # Do not instrument the tracer itself: ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1d0f1a1ac44c..66d9e907aa07 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -895,6 +895,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; @@ -1357,11 +1358,11 @@ void tracing_reset_all_online_cpus(void) #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned saved_tgids[SAVED_CMDLINES_DEFAULT]; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; + unsigned *saved_tgids; unsigned cmdline_num; int cmdline_idx; char *saved_cmdlines; @@ -1395,12 +1396,22 @@ static int allocate_cmdlines_buffer(unsigned int val, return -ENOMEM; } + s->saved_tgids = kmalloc_array(val, sizeof(*s->saved_tgids), + GFP_KERNEL); + if (!s->saved_tgids) { + kfree(s->map_cmdline_to_pid); + kfree(s->saved_cmdlines); + return -ENOMEM; + } + s->cmdline_idx = 0; s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); + memset(s->saved_tgids, 0, + val * sizeof(*s->saved_tgids)); return 0; } @@ -1596,7 +1607,7 @@ static int trace_save_cmdline(struct task_struct *tsk) } set_cmdline(idx, tsk->comm); - saved_tgids[idx] = tsk->tgid; + savedcmd->saved_tgids[idx] = tsk->tgid; arch_spin_unlock(&trace_cmdline_lock); return 1; @@ -1648,7 +1659,7 @@ int trace_find_tgid(int pid) arch_spin_lock(&trace_cmdline_lock); map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - tgid = saved_tgids[map]; + tgid = savedcmd->saved_tgids[map]; else tgid = -1; @@ -3980,6 +3991,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { kfree(s->saved_cmdlines); 
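[Editor's note] ktime_get_boot_fast_ns() above is simply the monotonic fast clock plus tk->offs_boot, the same relationship userspace sees between CLOCK_MONOTONIC and CLOCK_BOOTTIME, and the patch also exposes it to the tracer as the "boot" trace clock. A purely illustrative userspace comparison of the two POSIX clocks:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	/* The difference is (roughly) the accumulated suspend time,
	 * i.e. the offs_boot term folded in by ktime_get_boot_fast_ns(). */
	printf("monotonic: %lld.%09ld\n", (long long)mono.tv_sec, mono.tv_nsec);
	printf("boottime : %lld.%09ld\n", (long long)boot.tv_sec, boot.tv_nsec);
	return 0;
}

Selecting "boot" via the tracefs trace_clock file then timestamps events on this clock, so traces taken across suspend line up with boot-relative timestamps.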
kfree(s->map_cmdline_to_pid); + kfree(s->saved_tgids); kfree(s); } @@ -4221,13 +4233,13 @@ tracing_saved_tgids_read(struct file *file, char __user *ubuf, int pid; int i; - file_buf = kmalloc(SAVED_CMDLINES_DEFAULT*(16+1+16), GFP_KERNEL); + file_buf = kmalloc(savedcmd->cmdline_num*(16+1+16), GFP_KERNEL); if (!file_buf) return -ENOMEM; buf = file_buf; - for (i = 0; i < SAVED_CMDLINES_DEFAULT; i++) { + for (i = 0; i < savedcmd->cmdline_num; i++) { int tgid; int r; @@ -4822,19 +4834,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; ssize_t sret; - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_init(&iter->seq); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ mutex_lock(&iter->mutex); + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + goto out; + + trace_seq_init(&iter->seq); + if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) @@ -5864,9 +5877,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) return -EINVAL; @@ -5876,6 +5886,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); @@ -5933,19 +5946,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { if (ret) - return ret; + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) - return -EAGAIN; + goto out; ret = wait_on_pipe(iter, true); if (ret) - return ret; + goto out; goto again; } ret = splice_to_pipe(pipe, &spd); +out: splice_shrink_spd(&spd); return ret; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f2813e137b23..91fa701b4a24 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -27,6 +27,7 @@ #include <linux/kvm_para.h> #include <linux/perf_event.h> #include <linux/kthread.h> +#include <soc/qcom/watchdog.h> /* * The run state of the lockup detectors is controlled by the content of the @@ -366,8 +367,11 @@ static void watchdog_check_hardlockup_other_cpu(void) if (per_cpu(hard_watchdog_warn, next_cpu) == true) return; - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu); + if (hardlockup_panic) { + pr_err("Watchdog detected hard LOCKUP on cpu %u", + next_cpu); + msm_trigger_wdog_bite(); + } else WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu); @@ -430,6 +434,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + if (hardlockup_panic) + msm_trigger_wdog_bite(); + print_modules(); print_irqtrace_events(current); if (regs) @@ -552,6 +559,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); + + if (softlockup_panic) + msm_trigger_wdog_bite(); __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); |
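[Editor's note] One observable effect of the saved_tgids rework above: the tgid table is now part of savedcmd and is sized by cmdline_num rather than the fixed SAVED_CMDLINES_DEFAULT, so growing the cmdline cache grows the pid-to-tgid map with it. A rough userspace sketch; the tracefs mount path and the resize value are assumptions about the target system:

#include <stdio.h>

/* Adjust for /sys/kernel/tracing on systems that mount tracefs there. */
#define TRACE_DIR "/sys/kernel/debug/tracing"

int main(void)
{
	FILE *f;
	char line[64];

	/* Grow the cmdline cache; with the change above the tgid table
	 * is allocated to match, so every cached entry has a tgid slot. */
	f = fopen(TRACE_DIR "/saved_cmdlines_size", "w");
	if (f) {
		fputs("1024\n", f);
		fclose(f);
	}

	f = fopen(TRACE_DIR "/saved_tgids", "r");
	if (!f) {
		perror("saved_tgids");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* cached pid/tgid pairs */
	fclose(f);
	return 0;
}

Each entry pairs a cached pid with the tgid recorded by trace_save_cmdline(), and tracing_saved_tgids_read() now sizes its output buffer from cmdline_num as well.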
