diff options
Diffstat (limited to 'kernel')
54 files changed, 978 insertions, 364 deletions
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4504ca66118d..50da680c479f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -166,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) if (!task) return -EINVAL; - memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); return 0; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8a797d50b7..cb85d228b1ac 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - bpf_map_inc(raw, true); + raw = bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -277,7 +277,8 @@ static void *bpf_obj_do_get(const struct filename *pathname, goto out; raw = bpf_any_get(inode->i_private, *type); - touch_atime(&path); + if (!IS_ERR(raw)) + touch_atime(&path); path_put(&path); return raw; @@ -357,7 +358,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) static struct dentry *bpf_mount(struct file_system_type *type, int flags, const char *dev_name, void *data) { - return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); + return mount_nodev(type, flags, data, bpf_fill_super); } static struct file_system_type bpf_fs_type = { @@ -365,7 +366,6 @@ static struct file_system_type bpf_fs_type = { .name = "bpf", .mount = bpf_mount, .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, }; MODULE_ALIAS_FS("bpf"); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3b39550d8485..4e32cc94edd9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,11 +181,18 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -void bpf_map_inc(struct bpf_map *map, bool uref) +/* prog's and map's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + +struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) { - atomic_inc(&map->refcnt); + if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&map->refcnt); + return ERR_PTR(-EBUSY); + } if (uref) atomic_inc(&map->usercnt); + return map; } struct bpf_map *bpf_map_get_with_uref(u32 ufd) @@ -197,7 +204,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - bpf_map_inc(map, true); + map = bpf_map_inc(map, true); fdput(f); return map; @@ -580,6 +587,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) return f.file->private_data; } +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&prog->aux->refcnt); + return ERR_PTR(-EBUSY); + } + return prog; +} + /* called by sockets/tracing/seccomp before attaching program to an event * pairs with bpf_prog_put() */ @@ -592,7 +608,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) if (IS_ERR(prog)) return prog; - atomic_inc(&prog->aux->refcnt); + prog = bpf_prog_inc(prog); fdput(f); return prog; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7945d10b378..2cbfba78d3db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -239,15 +239,6 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; -static const struct { - int map_type; - int func_id; -} func_limit[] = { - {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, -}; - static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -898,24 +889,44 @@ static int check_func_arg(struct verifier_env *env, u32 regno, static int check_map_func_compatibility(struct bpf_map *map, int func_id) { - bool bool_map, bool_func; - int i; - if (!map) return 0; - for (i = 0; i < ARRAY_SIZE(func_limit); i++) { - bool_map = (map->map_type == func_limit[i].map_type); - bool_func = (func_id == func_limit[i].func_id); - /* only when map & func pair match it can continue. - * don't allow any other map type to be passed into - * the special func; - */ - if (bool_func && bool_map != bool_func) - return -EINVAL; + /* We need a two way check, first is from map perspective ... */ + switch (map->map_type) { + case BPF_MAP_TYPE_PROG_ARRAY: + if (func_id != BPF_FUNC_tail_call) + goto error; + break; + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + if (func_id != BPF_FUNC_perf_event_read && + func_id != BPF_FUNC_perf_event_output) + goto error; + break; + default: + break; + } + + /* ... and second from the function itself. */ + switch (func_id) { + case BPF_FUNC_tail_call: + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + goto error; + break; + case BPF_FUNC_perf_event_read: + case BPF_FUNC_perf_event_output: + if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) + goto error; + break; + default: + break; } return 0; +error: + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); + return -EINVAL; } static int check_call(struct verifier_env *env, int func_id) @@ -1121,6 +1132,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && @@ -1338,6 +1359,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { verbose("BPF_LD_ABS uses reserved fields\n"); return -EINVAL; @@ -1993,7 +2015,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); - fdput(f); return PTR_ERR(map); } @@ -2013,15 +2034,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return -E2BIG; } - /* remember this map */ - env->used_maps[env->used_map_cnt++] = map; - /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - bpf_map_inc(map, false); + map = bpf_map_inc(map, false); + if (IS_ERR(map)) { + fdput(f); + return PTR_ERR(map); + } + env->used_maps[env->used_map_cnt++] = map; + fdput(f); next_insn: insn++; @@ -2072,7 +2096,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } diff --git a/kernel/capability.c b/kernel/capability.c index 45432b54d5c6..00411c82dac5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } +static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +{ + int capable; + + if (unlikely(!cap_valid(cap))) { + pr_crit("capable() called with invalid cap=%u\n", cap); + BUG(); + } + + capable = audit ? security_capable(current_cred(), ns, cap) : + security_capable_noaudit(current_cred(), ns, cap); + if (capable == 0) { + current->flags |= PF_SUPERPRIV; + return true; + } + return false; +} + /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in @@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap) */ bool ns_capable(struct user_namespace *ns, int cap) { - if (unlikely(!cap_valid(cap))) { - pr_crit("capable() called with invalid cap=%u\n", cap); - BUG(); - } - - if (security_capable(current_cred(), ns, cap) == 0) { - current->flags |= PF_SUPERPRIV; - return true; - } - return false; + return ns_capable_common(ns, cap, true); } EXPORT_SYMBOL(ns_capable); +/** + * ns_capable_noaudit - Determine if the current task has a superior capability + * (unaudited) in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable_noaudit(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, false); +} +EXPORT_SYMBOL(ns_capable_noaudit); /** * capable - Determine if the current task has a superior capability in effect diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bd541053efc2..f53e61f95b55 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,7 +57,7 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> #include <linux/delay.h> - +#include <linux/cpuset.h> #include <linux/atomic.h> /* @@ -2498,6 +2498,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); + /* + * If ->dead, @src_set is associated with one or more dead cgroups + * and doesn't contain any migratable tasks. Ignore it early so + * that the rest of migration path doesn't get confused by it. + */ + if (src_cset->dead) + return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); if (!list_empty(&src_cset->mg_preload_node)) @@ -2768,9 +2776,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; + struct cgroup_subsys *ss; struct cgroup *cgrp; pid_t pid; - int ret; + int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; @@ -2818,6 +2827,9 @@ out_unlock_rcu: rcu_read_unlock(); out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -4735,14 +4747,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); @@ -4835,9 +4848,11 @@ static void init_and_link_css(struct cgroup_subsys_state *css, memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; + css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4860,6 +4875,10 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); } return ret; } @@ -5091,10 +5110,15 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); - offline_css(css); - mutex_unlock(&cgroup_mutex); - css_put(css); + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -5103,8 +5127,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } } /** @@ -5173,6 +5199,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; + struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); @@ -5193,11 +5220,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). + * Mark @cgrp and the associated csets dead. The former prevents + * further task migration and child creation by disabling + * cgroup_lock_live_group(). The latter makes the csets ignored by + * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; + spin_lock_bh(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + link->cset->dead = true; + spin_unlock_bh(&css_set_lock); + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3b4f27981778..e2e294d997e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -57,7 +57,6 @@ #include <asm/uaccess.h> #include <linux/atomic.h> #include <linux/mutex.h> -#include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/wait.h> @@ -287,6 +286,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -974,31 +975,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; - - tsk->mems_allowed = *to; + struct cpuset_migrate_mm_work *mwork; - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +static void cpuset_post_attach(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1099,7 +1120,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1544,11 +1566,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) { + if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - } - mmput(mm); + else + mmput(mm); } } @@ -1713,6 +1735,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -2087,6 +2110,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, + .post_attach = cpuset_post_attach, .bind = cpuset_bind, .legacy_cftypes = files, .early_init = 1, @@ -2386,6 +2410,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /** diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c1d6..ff8606f77d90 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx); */ int set_create_files_as(struct cred *new, struct inode *inode) { + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2f07e9c34f43..cf9dfbc15403 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -951,6 +951,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: + * cred_guard_mutex * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event_context::lock @@ -1585,14 +1586,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -3423,7 +3424,6 @@ static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; - int err; rcu_read_lock(); if (!vpid) @@ -3437,16 +3437,7 @@ find_lively_task_by_vpid(pid_t vpid) if (!task) return ERR_PTR(-ESRCH); - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto errout; - return task; -errout: - put_task_struct(task); - return ERR_PTR(err); - } /* @@ -7115,7 +7106,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) prog = event->tp_event->prog; if (prog) { event->tp_event->prog = NULL; - bpf_prog_put(prog); + bpf_prog_put_rcu(prog); } } @@ -7984,6 +7975,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + /* symmetric to unaccount_event() in _free_event() */ + account_event(event); + return event; err_per_task: @@ -8333,6 +8327,24 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (task) { + err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + if (err) + goto err_cpus; + + /* + * Reuse ptrace permission checks for now. + * + * We must hold cred_guard_mutex across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + goto err_cred; + } + if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; @@ -8340,7 +8352,7 @@ SYSCALL_DEFINE5(perf_event_open, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_cpus; + goto err_cred; } if (is_sampling_event(event)) { @@ -8350,8 +8362,6 @@ SYSCALL_DEFINE5(perf_event_open, } } - account_event(event); - /* * Special case software events and allow them to be part of * any hardware group. @@ -8401,11 +8411,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } - if (task) { - put_task_struct(task); - task = NULL; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -8493,6 +8498,11 @@ SYSCALL_DEFINE5(perf_event_open, WARN_ON_ONCE(ctx->parent_ctx); + /* + * This is the point on no return; we cannot fail hereafter. This is + * where we start modifying current state. + */ + if (move_group) { /* * See perf_event_ctx_lock() for comments on the details @@ -8562,6 +8572,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&gctx->mutex); mutex_unlock(&ctx->mutex); + if (task) { + mutex_unlock(&task->signal->cred_guard_mutex); + put_task_struct(task); + } + put_online_cpus(); event->owner = current; @@ -8590,7 +8605,15 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. + */ + if (!event_file) + free_event(event); +err_cred: + if (task) + mutex_unlock(&task->signal->cred_guard_mutex); err_cpus: put_online_cpus(); err_task: @@ -8634,8 +8657,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = EVENT_OWNER_KERNEL; - account_event(event); - ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); @@ -8872,6 +8893,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. + * + * Can be called with cred_guard_mutex held when called from + * install_exec_creds(). */ void perf_event_exit_task(struct task_struct *child) { diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index adfdc0536117..014b69528194 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -347,6 +347,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) { struct ring_buffer *rb = handle->rb; + bool wakeup = truncated; unsigned long aux_head; u64 flags = 0; @@ -375,9 +376,16 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { - perf_output_wakeup(handle); + wakeup = true; local_add(rb->aux_watermark, &rb->aux_wakeup); } + + if (wakeup) { + if (truncated) + handle->event->pending_disable = 1; + perf_output_wakeup(handle); + } + handle->event = NULL; local_set(&rb->aux_nest, 0); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913abf..da0c09ff6112 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -171,8 +171,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + if (!ptep) { + mem_cgroup_cancel_charge(kpage, memcg); goto unlock; + } get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -199,7 +201,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; diff --git a/kernel/exit.c b/kernel/exit.c index 92ff63200287..62c4bd4abd3a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -923,17 +923,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p) task_pid_type(p, wo->wo_type) == wo->wo_pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int +eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) + + /* + * Wait for all children (clone and not) if __WALL is set or + * if it is traced by us. + */ + if (ptrace || (wo->wo_flags & __WALL)) + return 1; + + /* + * Otherwise, wait for clone children *only* if __WCLONE is set; + * otherwise, wait for non-clone children *only*. + * + * Note: a "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD, or a non-leader thread which + * we can only see if it is traced by us. + */ + if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; @@ -1306,7 +1317,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (unlikely(exit_state == EXIT_DEAD)) return 0; - ret = eligible_child(wo, p); + ret = eligible_child(wo, ptrace, p); if (!ret) return ret; diff --git a/kernel/futex.c b/kernel/futex.c index 495a1d06915b..9d8163afd87c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1244,10 +1244,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; + } else if (curval != uval) { + /* + * If a unconditional UNLOCK_PI operation (user space did not + * try the TID->0 transition) raced with a waiter setting the + * FUTEX_WAITERS flag between get_user() and locking the hash + * bucket lock, retry the operation. + */ + if ((FUTEX_TID_MASK & curval) == uval) + ret = -EAGAIN; + else + ret = -EINVAL; + } if (ret) { raw_spin_unlock(&pi_state->pi_mutex.wait_lock); return ret; @@ -1474,8 +1484,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); hb_waiters_dec(hb1); - plist_add(&q->list, &hb2->chain); hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); @@ -2538,6 +2548,15 @@ retry: if (ret == -EFAULT) goto pi_faulted; /* + * A unconditional UNLOCK_PI op raced against a waiter + * setting the FUTEX_WAITERS bit. Try again. + */ + if (ret == -EAGAIN) { + spin_unlock(&hb->lock); + put_futex_key(&key); + goto retry; + } + /* * wake_futex_pi has detected invalid state. Tell user * space. */ @@ -2755,6 +2774,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + free_pi_state(q.pi_state); spin_unlock(q.lock_ptr); } } else { diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a302cf9a2126..57bff7857e87 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -138,7 +138,8 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) unsigned int flags = 0, irq = desc->irq_data.irq; struct irqaction *action = desc->action; - do { + /* action might have become NULL since we dropped the lock */ + while (action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +174,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval |= res; action = action->next; - } while (action); + } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6b0c0b74a2a1..4b21779d5163 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq = -1; + int i, ret, virq; ret = ops->msi_check(domain, info, dev); if (ret == 0) @@ -278,12 +278,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, for_each_msi_entry(desc, dev) { ops->set_desc(&arg, desc); - if (info->flags & MSI_FLAG_IDENTITY_MAP) - virq = (int)ops->get_hwirq(info, &arg); - else - virq = -1; - virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, + virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false); if (virq < 0) { ret = -ENOSPC; @@ -307,6 +303,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, else dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { + struct irq_data *irq_data; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_domain_activate_irq(irq_data); + } } return 0; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..4b353e0be121 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { + int v, v1; + STATIC_KEY_CHECK_USE(); - if (atomic_inc_not_zero(&key->enabled)) - return; + + /* + * Careful if we get concurrent static_key_slow_inc() calls; + * later calls must wait for the first one to _finish_ the + * jump_label_update() process. At the same time, however, + * the jump_label_update() call below wants to see + * static_key_enabled(&key) for jumps to be updated properly. + * + * So give a special meaning to negative key->enabled: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. + */ + for (v = atomic_read(&key->enabled); v > 0; v = v1) { + v1 = atomic_cmpxchg(&key->enabled, v, v + 1); + if (likely(v1 == v)) + return; + } jump_label_lock(); - if (atomic_inc_return(&key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); jump_label_update(key); + atomic_set(&key->enabled, 1); + } else { + atomic_inc(&key->enabled); + } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + /* + * The negative count check is valid even when a negative + * key->enabled is in use by static_key_slow_inc(); a + * __static_key_slow_dec() before the first static_key_slow_inc() + * returns is unbalanced, because all other static_key_slow_inc() + * instances block while the update is in progress. + */ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5b9102a47ea5..c835270f0c2f 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg_acquire(lock, node); + /* + * We rely on the full barrier with global transitivity implied by the + * below xchg() to order the initialization stores above against any + * observation of @node. And to provide the ACQUIRE ordering associated + * with a LOCK primitive. + */ + prev = xchg(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0551c219c40e..89350f924c85 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) if (!hold_ctx) return 0; - if (unlikely(ctx == hold_ctx)) - return -EALREADY; - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { #ifdef CONFIG_DEBUG_MUTEXES @@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; int ret; + if (use_ww_ctx) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) + return -EALREADY; + } + preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 87e9ce6a63c5..8173bc7fec92 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -255,6 +255,66 @@ static __always_inline void __pv_wait_head(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif +/* + * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before + * issuing an _unordered_ store to set _Q_LOCKED_VAL. + * + * This means that the store can be delayed, but no later than the + * store-release from the unlock. This means that simply observing + * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. + * + * There are two paths that can issue the unordered store: + * + * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 + * + * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 + * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 + * + * However, in both cases we have other !0 state we've set before to queue + * ourseves: + * + * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our + * load is constrained by that ACQUIRE to not pass before that, and thus must + * observe the store. + * + * For (2) we have a more intersting scenario. We enqueue ourselves using + * xchg_tail(), which ends up being a RELEASE. This in itself is not + * sufficient, however that is followed by an smp_cond_acquire() on the same + * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and + * guarantees we must observe that store. + * + * Therefore both cases have other !0 state that is observable before the + * unordered locked byte store comes through. This means we can use that to + * wait for the lock store, and then wait for an unlock. + */ +#ifndef queued_spin_unlock_wait +void queued_spin_unlock_wait(struct qspinlock *lock) +{ + u32 val; + + for (;;) { + val = atomic_read(&lock->val); + + if (!val) /* not locked, we're done */ + goto done; + + if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ + break; + + /* not locked, but pending, wait until we observe the lock */ + cpu_relax(); + } + + /* any unlock is good */ + while (atomic_read(&lock->val) & _Q_LOCKED_MASK) + cpu_relax(); + +done: + smp_rmb(); /* CTRL + RMB -> ACQUIRE */ +} +EXPORT_SYMBOL(queued_spin_unlock_wait); +#endif + #endif /* _GEN_PV_LOCK_SLOWPATH */ /** diff --git a/kernel/memremap.c b/kernel/memremap.c index 7658d32c5c78..25ced161ebeb 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -111,7 +111,7 @@ EXPORT_SYMBOL(memunmap); static void devm_memremap_release(struct device *dev, void *res) { - memunmap(res); + memunmap(*(void **)res); } static int devm_memremap_match(struct device *dev, void *res, void *match_data) @@ -133,8 +133,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset, if (addr) { *ptr = addr; devres_add(dev, ptr); - } else + } else { devres_free(ptr); + return ERR_PTR(-ENXIO); + } return addr; } diff --git a/kernel/module.c b/kernel/module.c index 38c7bd5583ff..b14a4f31221f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -327,6 +327,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -2492,10 +2495,21 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_size = debug_align(mod->init_size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_size = ALIGN(mod->init_size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_size; + mod->init_size += sizeof(struct mod_kallsyms); + mod->init_size = debug_align(mod->init_size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2504,28 +2518,33 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. */ + mod->kallsyms = mod->module_init + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->module_core + info->symoffs; - mod->core_strtab = s = mod->module_core + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->module_core + info->symoffs; + mod->core_kallsyms.strtab = s = mod->module_core + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2587,13 +2606,18 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { int err = -ENOKEY; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const void *mod = info->hdr; - if (info->len > markerlen && + /* + * Require flags == 0, as a module with version information + * removed is no longer the module that was signed + */ + if (flags == 0 && + info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; @@ -2612,7 +2636,7 @@ static int module_sig_check(struct load_info *info) return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { return 0; } @@ -3274,9 +3298,8 @@ static noinline int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); unset_module_init_ro_nx(mod); @@ -3426,7 +3449,7 @@ static int load_module(struct load_info *info, const char __user *uargs, long err; char *after_dashes; - err = module_sig_check(info); + err = module_sig_check(info, flags); if (err) goto free_copy; @@ -3515,7 +3538,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL, + -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); @@ -3646,6 +3669,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3653,6 +3681,7 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) @@ -3662,32 +3691,32 @@ static const char *get_ksymbol(struct module *mod, /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) + continue; + + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3777,19 +3806,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3798,11 +3829,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3841,11 +3873,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, module_assert_mutex(); list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } diff --git a/kernel/panic.c b/kernel/panic.c index 4b150bc0c6c1..41e2b54f36b5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -157,8 +157,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_trylock(); - console_unlock(); + console_flush_on_panic(); if (!panic_blink) panic_blink = no_blink; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..b7dd5718836e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode) pm_message_t msg; int error; + pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 28fb44dccbad..e7e586bb2022 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2241,13 +2241,24 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool retry; + bool do_cond_resched, retry; if (console_suspended) { up_console_sem(); return; } + /* + * Console drivers are called under logbuf_lock, so + * @console_may_schedule should be cleared before; however, we may + * end up dumping a lot of lines, for example, if called from + * console registration path, and should invoke cond_resched() + * between lines if allowable. Not doing so can cause a very long + * scheduling stall on a slow console leading to RCU stall and + * softlockup warnings which exacerbate the issue with more + * messages practically incapacitating the system. + */ + do_cond_resched = console_may_schedule; console_may_schedule = 0; /* flush buffered message fragment immediately to console */ @@ -2319,6 +2330,9 @@ skip: call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); local_irq_restore(flags); + + if (do_cond_resched) + cond_resched(); } console_locked = 0; @@ -2386,6 +2400,25 @@ void console_unblank(void) console_unlock(); } +/** + * console_flush_on_panic - flush console content on panic + * + * Immediately output all pending messages no matter what. + */ +void console_flush_on_panic(void) +{ + /* + * If someone else is holding the console lock, trylock will fail + * and may_schedule may be set. Ignore and proceed to unlock so + * that messages are flushed out. As this can be called from any + * context and we don't want to get preempted while flushing, + * ensure may_schedule is cleared. + */ + console_trylock(); + console_may_schedule = 0; + console_unlock(); +} + /* * Return the console tty driver structure and its associated index */ diff --git a/kernel/resource.c b/kernel/resource.c index f150dbbe6f62..249b1eb1e6e1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1083,9 +1083,10 @@ struct resource * __request_region(struct resource *parent, if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c90bfb620981..778335a7140c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -640,7 +640,10 @@ int get_nohz_timer_target(void) rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } @@ -5117,14 +5120,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif @@ -5759,6 +5764,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) walt_set_window_start(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; + account_reset_rq(rq); break; case CPU_ONLINE: @@ -7053,7 +7059,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -8028,7 +8034,7 @@ void set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); -static void free_sched_group(struct task_group *tg) +static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); @@ -8054,7 +8060,7 @@ struct task_group *sched_create_group(struct task_group *parent) return tg; err: - free_sched_group(tg); + sched_free_group(tg); return ERR_PTR(-ENOMEM); } @@ -8074,17 +8080,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) +static void sched_free_group_rcu(struct rcu_head *rhp) { /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); + sched_free_group(container_of(rhp, struct task_group, rcu)); } -/* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); + call_rcu(&tg->rcu, sched_free_group_rcu); } void sched_offline_group(struct task_group *tg) @@ -8545,31 +8550,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - if (parent) - sched_online_group(tg, parent); - return 0; + sched_offline_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_destroy_group(tg); -} - -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); + /* + * Relies on the RCU grace period between css_released() and this. + */ + sched_free_group(tg); } static void cpu_cgroup_fork(struct task_struct *task, void *private) @@ -8929,9 +8929,8 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, - .css_online = cpu_cgroup_css_online, - .css_offline = cpu_cgroup_css_offline, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 442a9f7a2832..acde1d7c763c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -275,21 +275,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; @@ -616,19 +616,25 @@ static void cputime_adjust(struct task_cputime *curr, stime = curr->stime; utime = curr->utime; - if (utime == 0) { - stime = rtime; + /* + * If either stime or both stime and utime are 0, assume all runtime is + * userspace. Once a task gets some ticks, the monotonicy code at + * 'update' will ensure things converge to the observed ratio. + */ + if (stime == 0) { + utime = rtime; goto update; } - if (stime == 0) { - utime = rtime; + if (utime == 0) { + stime = rtime; goto update; } stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)(stime + utime)); +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. @@ -651,7 +657,6 @@ static void cputime_adjust(struct task_cputime *curr, stime = rtime - utime; } -update: prev->stime = stime; prev->utime = utime; out: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e42cd11e430..14c06f92184c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -703,8 +703,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { @@ -1209,8 +1207,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1278,20 +1274,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. */ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1375,6 +1381,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1400,9 +1407,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * The dst_rq->curr isn't assigned. The protection for task_struct is + * finished. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -2699,6 +2713,23 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { @@ -2707,15 +2738,15 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); - sa->load_avg = max_t(long, sa->load_avg - r, 0); - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->load_avg, r); + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); - sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->util_avg, r); + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2785,10 +2816,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); } /* Add the load generated by se into cfs_rq's load average */ @@ -4711,19 +4742,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long w, W; + struct cfs_rq *cfs_rq = se->my_q; + long W, w = cfs_rq_load_avg(cfs_rq); - tg = se->my_q->tg; + tg = cfs_rq->tg; /* * W = @wg + \Sum rw_j */ - W = wg + calc_tg_weight(tg, se->my_q); + W = wg + atomic_long_read(&tg->load_avg); + + /* Ensure \Sum rw_j >= rw_i */ + W -= cfs_rq->tg_load_avg_contrib; + W += w; /* * w = rw_i + @wl */ - w = cfs_rq_load_avg(se->my_q) + wl; + w += wl; /* * wl = S * s'_i; see (2) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index ef7159012cf3..b0b93fd33af9 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq) static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; } #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 06f15724a639..2f2b959ad244 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2025,3 +2025,16 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +} diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d4024f..15a1795bbba1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } diff --git a/kernel/sys.c b/kernel/sys.c index 0c00e563b153..ba3ddb43dd9f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1855,11 +1855,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } - if (prctl_map.exe_fd != (u32)-1) + if (prctl_map.exe_fd != (u32)-1) { error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); - down_read(&mm->mmap_sem); - if (error) - goto out; + if (error) + return error; + } + + down_write(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -1896,10 +1898,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - error = 0; -out: - up_read(&mm->mmap_sem); - return error; + up_write(&mm->mmap_sem); + return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -1965,7 +1965,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, addr); prctl_map.start_code = mm->start_code; @@ -2058,7 +2058,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); return error; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f3300de62fe4..d964422eb601 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1839,6 +1839,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7e7746a42a62..10a1d7dc9313 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, } mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1347882d131e..b98810d2f3b4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c98439fdce81..405536b22c0c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -94,6 +94,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, @@ -102,7 +105,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - return hrtimer_clock_to_base_table[clock_id]; + int base = hrtimer_clock_to_base_table[clock_id]; + BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); + return base; } /* @@ -897,10 +902,10 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) + u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - unsigned int state = timer->state; + u8 state = timer->state; timer->state = newstate; if (!(state & HRTIMER_STATE_ENQUEUED)) @@ -930,7 +935,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state = timer->state; + u8 state = timer->state; int reprogram; /* @@ -954,6 +959,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest return 0; } +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); +#endif + return tim; +} + /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added @@ -974,19 +995,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); - if (mode & HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); -#endif - } + + tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); @@ -1074,19 +1086,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** @@ -1220,6 +1236,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, fn = timer->function; /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + + /* * Because we run timers from hardirq context, there is no chance * they get migrated to another cpu, therefore its safe to unlock * the timer base. diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8d262b467573..1d5c7204ddc9 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ */ static struct timeval itimer_get_remtime(struct hrtimer *timer) { - ktime_t rem = hrtimer_get_remaining(timer); + ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 149cc8086aea..ab861771e37f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -674,8 +674,24 @@ int ntp_validate_timex(struct timex *txc) return -EINVAL; } - if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) - return -EPERM; + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } + } /* * Check for potential multiplication overflows that can diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ce033c7aa2e8..9cff0ab82b63 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -69,10 +69,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock *clk = get_posix_clock(fp); - int result = 0; + unsigned int result = 0; if (!clk) - return -ENODEV; + return POLLERR; if (clk->ops.poll) result = clk->ops.poll(clk, fp, wait); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d282d52..80016b329d94 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -808,6 +808,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31d11ac9fa47..f2826c35e918 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(hrtimer_get_expires(timer), now); + remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7c7ec4515983..22c57e191a23 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -977,9 +977,9 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); - tick_program_event(next, 1); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d563c1960302..4ff237dbc006 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -305,8 +305,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) delta = timekeeping_get_delta(tkr); - nsec = delta * tkr->mult + tkr->xtime_nsec; - nsec >>= tkr->shift; + nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); @@ -384,7 +383,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + now = ktime_to_ns(tkr->base); + + now += clocksource_delta(tkr->read(tkr->clock), + tkr->cycle_last, tkr->mask); } while (read_seqcount_retry(&tkf->seq, seq)); return now; @@ -959,7 +961,7 @@ int timekeeping_inject_offset(struct timespec *ts) struct timespec64 ts64, tmp; int ret = 0; - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + if (!timespec_inject_offset_valid(ts)) return -EINVAL; ts64 = timespec_to_timespec64(*ts); diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd65236712..107310a6f36f 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -23,7 +23,9 @@ #include "timekeeping_internal.h" -static unsigned int sleep_time_bin[32] = {0}; +#define NUM_BINS 32 + +static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) { @@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { - sleep_time_bin[fls(t->tv_sec)]++; + /* Cap bin index so we don't overflow the array */ + int bin = min(fls(t->tv_sec), NUM_BINS-1); + + sleep_time_bin[bin]++; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index f75e35b60149..ba7d8b288bb3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); + SEQ_printf(m, ", S:%02x", timer->state); #ifdef CONFIG_TIMER_STATS SEQ_printf(m, ", "); print_name_offset(m, timer->start_site); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c6045a27ba3..acbb0e73d3a2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -437,7 +437,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; - unsigned int nr_pages; + unsigned long nr_pages; unsigned int current_context; struct list_head *pages; struct buffer_page *head_page; /* read from head */ @@ -458,7 +458,7 @@ struct ring_buffer_per_cpu { u64 write_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ - int nr_pages_to_update; + long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; @@ -1137,10 +1137,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) +static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) { - int i; struct buffer_page *bpage, *tmp; + long i; for (i = 0; i < nr_pages; i++) { struct page *page; @@ -1177,7 +1177,7 @@ free_pages: } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, - unsigned nr_pages) + unsigned long nr_pages) { LIST_HEAD(pages); @@ -1202,7 +1202,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, } static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; @@ -1302,8 +1302,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { struct ring_buffer *buffer; + long nr_pages; int bsize; - int cpu, nr_pages; + int cpu; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1429,12 +1430,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage) } static int -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { struct list_head *tail_page, *to_remove, *next_page; struct buffer_page *to_remove_page, *tmp_iter_page; struct buffer_page *last_page, *first_page; - unsigned int nr_removed; + unsigned long nr_removed; unsigned long head_bit; int page_entries; @@ -1651,7 +1652,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned nr_pages; + unsigned long nr_pages; int cpu, err = 0; /* @@ -1665,14 +1666,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return size; - size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - size *= BUF_PAGE_SIZE; + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); /* we need a minimum of two pages */ - if (size < BUF_PAGE_SIZE * 2) - size = BUF_PAGE_SIZE * 2; + if (nr_pages < 2) + nr_pages = 2; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size = nr_pages * BUF_PAGE_SIZE; /* * Don't succeed if resizing is disabled, as a reader might be @@ -4645,8 +4645,9 @@ static int rb_cpu_notify(struct notifier_block *self, struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); long cpu = (long)hcpu; - int cpu_i, nr_pages_same; - unsigned int nr_pages; + long nr_pages_same; + int cpu_i; + unsigned long nr_pages; switch (action) { case CPU_UP_PREPARE: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f375d4c05fb..dd4d86ae8e21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1771,7 +1771,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); @@ -5038,7 +5038,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - ret = splice_to_pipe(pipe, &spd); + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; out: splice_shrink_spd(&spd); return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4f6ef6912e00..996f0fd34312 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; - field = __find_event_field(&ftrace_generic_fields, name); + head = trace_get_fields(call); + field = __find_event_field(head, name); if (field) return field; - field = __find_event_field(&ftrace_common_fields, name); + field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; - head = trace_get_fields(call); - return __find_event_field(head, name); + return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, @@ -171,8 +171,10 @@ static int trace_define_generic_fields(void) { int ret; - __generic_field(int, cpu, FILTER_OTHER); - __generic_field(char *, comm, FILTER_PTR_STRING); + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); return ret; } @@ -869,7 +871,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } @@ -2104,8 +2107,13 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + /* + * Only event directories that can be enabled should have + * triggers. + */ + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f93a219b18da..6816302542b2 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field)) { + if (field->filter_type == FILTER_COMM) { + filter_build_regex(pred); + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (is_string_field(field)) { filter_build_regex(pred); - if (!strcmp(field->name, "comm")) { - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (field->filter_type == FILTER_STATIC_STRING) { + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - if (!strcmp(field->name, "cpu")) + if (field->filter_type == FILTER_CPU) fn = filter_pred_cpu; else fn = select_comparison_fn(pred->op, field->size, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index e4e56589ec1d..be3222b7d72e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -109,8 +109,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; local_save_flags(*flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(*flags)) + /* + * Slight chance to get a false positive on tracing_cpu, + * although I'm starting to think there isn't a chance. + * Leave this for now just to be paranoid. + */ + if (!irqs_disabled_flags(*flags) && !preempt_count()) return 0; *data = per_cpu_ptr(tr->trace_buffer.data, cpu); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 060df67dbdd1..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt { static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { if (!strcmp(pos->fmt, fmt)) return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); if (tb_fmt) { - *iter = tb_fmt->fmt; + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; continue; } @@ -296,6 +301,9 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; + if (!*fmt) + return 0; + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index dda9e6742950..202df6cffcca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -126,6 +126,13 @@ check_stack(unsigned long ip, unsigned long *stack) } /* + * Some archs may not have the passed in ip in the dump. + * If that happens, we need to show everything. + */ + if (i == stack_trace_max.nr_entries) + i = 0; + + /* * Now find where in the stack these are. */ x = 0; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e864906af3fc..1f1b05f5a94b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -1020,6 +1020,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * both lockup detectors are disabled if proc_watchdog_update() * returns an error. */ + if (old == new) + goto out; + err = proc_watchdog_update(); } out: @@ -1064,7 +1067,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old; + int err, old, new; get_online_cpus(); mutex_lock(&watchdog_proc_mutex); @@ -1084,6 +1087,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, /* * Update the sample period. Restore on failure. */ + new = ACCESS_ONCE(watchdog_thresh); + if (old == new) + goto out; + set_sample_period(); err = proc_watchdog_update(); if (err) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c579dbab2e36..2c2f971f3e75 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -568,6 +568,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { assert_rcu_or_wq_mutex_or_pool_mutex(wq); + + /* + * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a + * delayed item is pending. The plan is to keep CPU -> NODE + * mapping valid and stable across CPU on/offlines. Once that + * happens, this workaround can be removed. + */ + if (unlikely(node == NUMA_NO_NODE)) + return wq->dfl_pwq; + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } @@ -639,6 +649,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, */ smp_wmb(); set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + /* + * The following mb guarantees that previous clear of a PENDING bit + * will not be reordered with any speculative LOADS or STORES from + * work->current_func, which is executed afterwards. This possible + * reordering can lead to a missed execution on attempt to qeueue + * the same @work. E.g. consider this case: + * + * CPU#0 CPU#1 + * ---------------------------- -------------------------------- + * + * 1 STORE event_indicated + * 2 queue_work_on() { + * 3 test_and_set_bit(PENDING) + * 4 } set_..._and_clear_pending() { + * 5 set_work_data() # clear bit + * 6 smp_mb() + * 7 work->current_func() { + * 8 LOAD event_indicated + * } + * + * Without an explicit full barrier speculative LOAD on line 8 can + * be executed before CPU#0 does STORE on line 1. If that happens, + * CPU#0 observes the PENDING bit is still set and new execution of + * a @work is not queued in a hope, that CPU#1 will eventually + * finish the queued @work. Meanwhile CPU#1 does not see + * event_indicated is set, because speculative LOAD was executed + * before actual STORE. + */ + smp_mb(); } static void clear_work_data(struct work_struct *work) @@ -1458,13 +1497,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; - /* timer isn't guaranteed to run in this cpu, record earlier */ - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - add_timer_on(timer, cpu); + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } /** @@ -4418,6 +4457,17 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + + /* + * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED + * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is + * being reworked and this can go away in time. + */ + if (!(pool->flags & POOL_DISASSOCIATED)) { + spin_unlock_irq(&pool->lock); + return; + } + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { |