Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/inode.c              7
-rw-r--r--  kernel/bpf/syscall.c           24
-rw-r--r--  kernel/bpf/verifier.c          66
-rw-r--r--  kernel/cgroup.c                14
-rw-r--r--  kernel/cpuset.c                 4
-rw-r--r--  kernel/events/ring_buffer.c    10
-rw-r--r--  kernel/futex.c                 27
-rw-r--r--  kernel/locking/mcs_spinlock.h   8
-rw-r--r--  kernel/sched/core.c            35
-rw-r--r--  kernel/trace/trace_events.c     9
-rw-r--r--  kernel/workqueue.c             40
11 files changed, 69 insertions, 175 deletions
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index d1a7646f79c5..5a8a797d50b7 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
 {
 	switch (type) {
 	case BPF_TYPE_PROG:
-		raw = bpf_prog_inc(raw);
+		atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
 		break;
 	case BPF_TYPE_MAP:
-		raw = bpf_map_inc(raw, true);
+		bpf_map_inc(raw, true);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -277,8 +277,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
 		goto out;
 
 	raw = bpf_any_get(inode->i_private, *type);
-	if (!IS_ERR(raw))
-		touch_atime(&path);
+	touch_atime(&path);
 
 	path_put(&path);
 	return raw;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4e32cc94edd9..3b39550d8485 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -181,18 +181,11 @@ struct bpf_map *__bpf_map_get(struct fd f)
 	return f.file->private_data;
 }
 
-/* prog's and map's refcnt limit */
-#define BPF_MAX_REFCNT 32768
-
-struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
+void bpf_map_inc(struct bpf_map *map, bool uref)
 {
-	if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
-		atomic_dec(&map->refcnt);
-		return ERR_PTR(-EBUSY);
-	}
+	atomic_inc(&map->refcnt);
 	if (uref)
 		atomic_inc(&map->usercnt);
-	return map;
 }
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
@@ -204,7 +197,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	if (IS_ERR(map))
 		return map;
 
-	map = bpf_map_inc(map, true);
+	bpf_map_inc(map, true);
 	fdput(f);
 
 	return map;
@@ -587,15 +580,6 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
 	return f.file->private_data;
 }
 
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
-{
-	if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
-		atomic_dec(&prog->aux->refcnt);
-		return ERR_PTR(-EBUSY);
-	}
-	return prog;
-}
-
 /* called by sockets/tracing/seccomp before attaching program to an event
  * pairs with bpf_prog_put()
  */
@@ -608,7 +592,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 	if (IS_ERR(prog))
 		return prog;
 
-	prog = bpf_prog_inc(prog);
+	atomic_inc(&prog->aux->refcnt);
 	fdput(f);
 
 	return prog;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cbfba78d3db..2e7f7ab739e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -239,6 +239,15 @@ static const char * const reg_type_str[] = {
 	[CONST_IMM]		= "imm",
 };
 
+static const struct {
+	int map_type;
+	int func_id;
+} func_limit[] = {
+	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
+};
+
 static void print_verifier_state(struct verifier_env *env)
 {
 	enum bpf_reg_type t;
@@ -889,44 +898,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 
 static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 {
+	bool bool_map, bool_func;
+	int i;
+
 	if (!map)
 		return 0;
 
-	/* We need a two way check, first is from map perspective ... */
-	switch (map->map_type) {
-	case BPF_MAP_TYPE_PROG_ARRAY:
-		if (func_id != BPF_FUNC_tail_call)
-			goto error;
-		break;
-	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
-		if (func_id != BPF_FUNC_perf_event_read &&
-		    func_id != BPF_FUNC_perf_event_output)
-			goto error;
-		break;
-	default:
-		break;
-	}
-
-	/* ... and second from the function itself. */
-	switch (func_id) {
-	case BPF_FUNC_tail_call:
-		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
-			goto error;
-		break;
-	case BPF_FUNC_perf_event_read:
-	case BPF_FUNC_perf_event_output:
-		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
-			goto error;
-		break;
-	default:
-		break;
+	for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
+		bool_map = (map->map_type == func_limit[i].map_type);
+		bool_func = (func_id == func_limit[i].func_id);
+		/* only when map & func pair match it can continue.
+		 * don't allow any other map type to be passed into
+		 * the special func;
+		 */
+		if (bool_func && bool_map != bool_func)
+			return -EINVAL;
 	}
 
 	return 0;
-error:
-	verbose("cannot pass map_type %d into func %d\n",
-		map->map_type, func_id);
-	return -EINVAL;
 }
 
 static int check_call(struct verifier_env *env, int func_id)
@@ -1359,7 +1348,6 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
 	}
 	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
-	    BPF_SIZE(insn->code) == BPF_DW ||
 	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
 		verbose("BPF_LD_ABS uses reserved fields\n");
 		return -EINVAL;
 	}
@@ -2015,6 +2003,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 			if (IS_ERR(map)) {
 				verbose("fd %d is not pointing to valid bpf_map\n",
 					insn->imm);
+				fdput(f);
 				return PTR_ERR(map);
 			}
 
@@ -2034,18 +2023,15 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 				return -E2BIG;
 			}
 
+			/* remember this map */
+			env->used_maps[env->used_map_cnt++] = map;
+
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
 			 * will be used by the valid program until it's unloaded
 			 * and all maps are released in free_bpf_prog_info()
 			 */
-			map = bpf_map_inc(map, false);
-			if (IS_ERR(map)) {
-				fdput(f);
-				return PTR_ERR(map);
-			}
-			env->used_maps[env->used_map_cnt++] = map;
-
+			bpf_map_inc(map, false);
 			fdput(f);
 next_insn:
 			insn++;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9d03abef6676..e8d71110ed2a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2776,10 +2776,9 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 				    size_t nbytes, loff_t off, bool threadgroup)
 {
 	struct task_struct *tsk;
-	struct cgroup_subsys *ss;
 	struct cgroup *cgrp;
 	pid_t pid;
-	int ssid, ret;
+	int ret;
 
 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 		return -EINVAL;
@@ -2827,10 +2826,8 @@ out_unlock_rcu:
 	rcu_read_unlock();
 out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
-	for_each_subsys(ss, ssid)
-		if (ss->post_attach)
-			ss->post_attach();
 	cgroup_kn_unlock(of->kn);
+	cpuset_post_attach_flush();
 	return ret ?: nbytes;
 }
 
@@ -4747,15 +4744,14 @@ static void css_free_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css free path */
-		struct cgroup_subsys_state *parent = css->parent;
 		int id = css->id;
 
+		if (css->parent)
+			css_put(css->parent);
+
 		ss->css_free(css);
 		cgroup_idr_remove(&ss->css_idr, id);
 		cgroup_put(cgrp);
-
-		if (parent)
-			css_put(parent);
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a65d63463420..2df78d45a096 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -57,6 +57,7 @@
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
+#include <linux/workqueue.h>
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
@@ -1014,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	}
 }
 
-static void cpuset_post_attach(void)
+void cpuset_post_attach_flush(void)
 {
 	flush_workqueue(cpuset_migrate_mm_wq);
 }
@@ -2100,7 +2101,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.allow_attach	= cpuset_allow_attach,
 	.cancel_attach	= cpuset_cancel_attach,
 	.attach		= cpuset_attach,
-	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
 	.legacy_cftypes	= files,
 	.early_init	= 1,
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 014b69528194..adfdc0536117 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -347,7 +347,6 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 			 bool truncated)
 {
 	struct ring_buffer *rb = handle->rb;
-	bool wakeup = truncated;
 	unsigned long aux_head;
 	u64 flags = 0;
 
@@ -376,16 +375,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
 
 	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
-		wakeup = true;
-		local_add(rb->aux_watermark, &rb->aux_wakeup);
-	}
-
-	if (wakeup) {
-		if (truncated)
-			handle->event->pending_disable = 1;
 		perf_output_wakeup(handle);
+		local_add(rb->aux_watermark, &rb->aux_wakeup);
 	}
-
 	handle->event = NULL;
 
 	local_set(&rb->aux_nest, 0);
diff --git a/kernel/futex.c b/kernel/futex.c
index 9d8163afd87c..461c72b2dac2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1244,20 +1244,10 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 	if (unlikely(should_fail_futex(true)))
 		ret = -EFAULT;
 
-	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
+	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
 		ret = -EFAULT;
-	} else if (curval != uval) {
-		/*
-		 * If a unconditional UNLOCK_PI operation (user space did not
-		 * try the TID->0 transition) raced with a waiter setting the
-		 * FUTEX_WAITERS flag between get_user() and locking the hash
-		 * bucket lock, retry the operation.
-		 */
-		if ((FUTEX_TID_MASK & curval) == uval)
-			ret = -EAGAIN;
-		else
-			ret = -EINVAL;
-	}
+	else if (curval != uval)
+		ret = -EINVAL;
 	if (ret) {
 		raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 		return ret;
 	}
@@ -1484,8 +1474,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
 		hb_waiters_dec(hb1);
-		hb_waiters_inc(hb2);
 		plist_add(&q->list, &hb2->chain);
+		hb_waiters_inc(hb2);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
@@ -2548,15 +2538,6 @@ retry:
 		if (ret == -EFAULT)
 			goto pi_faulted;
 		/*
-		 * A unconditional UNLOCK_PI op raced against a waiter
-		 * setting the FUTEX_WAITERS bit. Try again.
-		 */
-		if (ret == -EAGAIN) {
-			spin_unlock(&hb->lock);
-			put_futex_key(&key);
-			goto retry;
-		}
-		/*
 		 * wake_futex_pi has detected invalid state. Tell user
 		 * space.
 		 */
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index c835270f0c2f..5b9102a47ea5 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,13 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 	node->locked = 0;
 	node->next = NULL;
 
-	/*
-	 * We rely on the full barrier with global transitivity implied by the
-	 * below xchg() to order the initialization stores above against any
-	 * observation of @node. And to provide the ACQUIRE ordering associated
-	 * with a LOCK primitive.
-	 */
-	prev = xchg(lock, node);
+	prev = xchg_acquire(lock, node);
 	if (likely(prev == NULL)) {
 		/*
 		 * Lock acquired, don't need to set node->locked to 1. Threads
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25afcb8a1402..db0472b37feb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11223,7 +11223,7 @@ void set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void sched_free_group(struct task_group *tg)
+static void free_sched_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
@@ -11249,7 +11249,7 @@ struct task_group *sched_create_group(struct task_group *parent)
 	return tg;
 
 err:
-	sched_free_group(tg);
+	free_sched_group(tg);
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -11269,16 +11269,17 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void free_sched_group_rcu(struct rcu_head *rhp)
 {
 	/* now it should be safe to free those cfs_rqs */
-	sched_free_group(container_of(rhp, struct task_group, rcu));
+	free_sched_group(container_of(rhp, struct task_group, rcu));
 }
 
+/* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
 	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, sched_free_group_rcu);
+	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
 
 void sched_offline_group(struct task_group *tg)
@@ -11739,26 +11740,31 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
-	sched_online_group(tg, parent);
-
 	return &tg->css;
 }
 
-static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
+	struct task_group *parent = css_tg(css->parent);
 
-	sched_offline_group(tg);
+	if (parent)
+		sched_online_group(tg, parent);
+	return 0;
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
-	/*
-	 * Relies on the RCU grace period between css_released() and this.
-	 */
-	sched_free_group(tg);
+	sched_destroy_group(tg);
+}
+
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+
+	sched_offline_group(tg);
 }
 
 static void cpu_cgroup_fork(struct task_struct *task, void *private)
@@ -12187,8 +12193,9 @@ static struct cftype cpu_files[] = {
 
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
-	.css_released	= cpu_cgroup_css_released,
 	.css_free	= cpu_cgroup_css_free,
+	.css_online	= cpu_cgroup_css_online,
+	.css_offline	= cpu_cgroup_css_offline,
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 26960e49bb8c..fda3b6e1b3a0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2108,13 +2108,8 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	trace_create_file("filter", 0644, file->dir, file,
 			  &ftrace_event_filter_fops);
 
-	/*
-	 * Only event directories that can be enabled should have
-	 * triggers.
-	 */
-	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
-		trace_create_file("trigger", 0644, file->dir, file,
-				  &event_trigger_fops);
+	trace_create_file("trigger", 0644, file->dir, file,
+			  &event_trigger_fops);
 
 	trace_create_file("format", 0444, file->dir, call,
 			  &ftrace_event_format_fops);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 316b316c7528..ef84d9874d03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -652,35 +652,6 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
 	 */
 	smp_wmb();
 	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
-	/*
-	 * The following mb guarantees that previous clear of a PENDING bit
-	 * will not be reordered with any speculative LOADS or STORES from
-	 * work->current_func, which is executed afterwards. This possible
-	 * reordering can lead to a missed execution on attempt to qeueue
-	 * the same @work. E.g. consider this case:
-	 *
-	 *   CPU#0                         CPU#1
-	 *   ----------------------------  --------------------------------
-	 *
-	 * 1  STORE event_indicated
-	 * 2  queue_work_on() {
-	 * 3    test_and_set_bit(PENDING)
-	 * 4 }                             set_..._and_clear_pending() {
-	 * 5                                 set_work_data() # clear bit
-	 * 6                                 smp_mb()
-	 * 7                               work->current_func() {
-	 * 8                                 LOAD event_indicated
-	 *                                 }
-	 *
-	 * Without an explicit full barrier speculative LOAD on line 8 can
-	 * be executed before CPU#0 does STORE on line 1. If that happens,
-	 * CPU#0 observes the PENDING bit is still set and new execution of
-	 * a @work is not queued in a hope, that CPU#1 will eventually
-	 * finish the queued @work. Meanwhile CPU#1 does not see
-	 * event_indicated is set, because speculative LOAD was executed
-	 * before actual STORE.
-	 */
-	smp_mb();
 }
 
 static void clear_work_data(struct work_struct *work)
@@ -4476,17 +4447,6 @@ static void rebind_workers(struct worker_pool *pool)
 						  pool->attrs->cpumask) < 0);
 
 	spin_lock_irq(&pool->lock);
-
-	/*
-	 * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
-	 * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
-	 * being reworked and this can go away in time.
-	 */
-	if (!(pool->flags & POOL_DISASSOCIATED)) {
-		spin_unlock_irq(&pool->lock);
-		return;
-	}
-
 	pool->flags &= ~POOL_DISASSOCIATED;
 
 	for_each_pool_worker(worker, pool) {
