Diffstat (limited to 'kernel'): 49 files changed, 1205 insertions, 483 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index d440c25cb3be..f94484ea1b26 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -753,13 +753,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature audit_log_end(ab); } -static int audit_set_feature(struct sk_buff *skb) +static int audit_set_feature(struct audit_features *uaf) { - struct audit_features *uaf; int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); - uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ @@ -815,6 +813,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; + int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; @@ -838,6 +837,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); + data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { @@ -859,7 +859,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) @@ -914,7 +914,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; break; case AUDIT_SET_FEATURE: - err = audit_set_feature(skb); + if (data_len < sizeof(struct audit_features)) + return -EINVAL; + err = audit_set_feature(data); if (err) return err; break; @@ -923,9 +925,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_FIRST_USER_MSG2 ... 
AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; + /* exit early if there isn't at least one character to print */ + if (data_len < 2) + return -EINVAL; err = audit_filter_user(msg_type); if (err == 1) { /* match or error */ + char *str = data; + err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push_current(); @@ -934,19 +941,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } mutex_unlock(&audit_cmd_mutex); audit_log_common_recv_msg(&ab, msg_type); - if (msg_type != AUDIT_USER_TTY) + if (msg_type != AUDIT_USER_TTY) { + /* ensure NULL termination */ + str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, - (char *)data); - else { - int size; - + str); + } else { audit_log_format(ab, " data="); - size = nlmsg_len(nlh); - if (size > 0 && - ((unsigned char *)data)[size - 1] == '\0') - size--; - audit_log_n_untrustedstring(ab, data, size); + if (data_len > 0 && str[data_len - 1] == '\0') + data_len--; + audit_log_n_untrustedstring(ab, str, data_len); } audit_set_portid(ab, NETLINK_CB(skb).portid); audit_log_end(ab); @@ -955,7 +960,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); @@ -964,7 +969,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EPERM; } err = audit_rule_change(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh)); + seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -978,7 +983,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t msglen = nlmsg_len(nlh); + size_t msglen = data_len; char *old, *new; err = -EINVAL; @@ -1055,7 +1060,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index cf7aa656b308..41a668a9d561 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -434,6 +434,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 f_val; err = -EINVAL; @@ -442,12 +443,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; f->type = data->fields[i]; - f->val = data->values[i]; + f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { + if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; - f->val = 0; + f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } @@ -463,7 +464,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: - f->uid = make_kuid(current_user_ns(), f->val); + f->uid = make_kuid(current_user_ns(), f_val); if 
(!uid_valid(f->uid)) goto exit_free; break; @@ -472,11 +473,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - f->gid = make_kgid(current_user_ns(), f->val); + f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: + f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: @@ -489,11 +491,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } + entry->rule.buflen += f_val; + f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they @@ -502,68 +506,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; - } - if (err) { - kfree(str); + } else if (err) goto exit_free; - } else - f->lsm_str = str; break; case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); + } + err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } + entry->rule.buflen += f_val; break; case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; + entry->rule.buflen += f_val; break; case AUDIT_INODE: + f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; + } + entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: - if (entry->rule.exe || f->val > PATH_MAX) + if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); + str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } - entry->rule.buflen += f->val; - - audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } + entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; + default: + f->val = f_val; + break; } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 04fc1022ad9f..01431ef8cf07 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -152,7 +152,7 @@ static int map_create(union bpf_attr *attr) err = bpf_map_charge_memlock(map); if (err) - goto free_map; + goto free_map_nouncharge; err = 
bpf_map_new_fd(map); if (err < 0) @@ -162,6 +162,8 @@ static int map_create(union bpf_attr *attr) return err; free_map: + bpf_map_uncharge_memlock(map); +free_map_nouncharge: map->ops->map_free(map); return err; } @@ -667,7 +669,7 @@ static int bpf_obj_get(const union bpf_attr *attr) SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { - union bpf_attr attr = {}; + union bpf_attr attr; int err; if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) @@ -703,6 +705,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz } /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(&attr, 0, sizeof(attr)); if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index b50d5a167fda..ea8cb03dbf72 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -48,7 +48,7 @@ struct pids_cgroup { * %PIDS_MAX = (%PID_MAX_LIMIT + 1). */ atomic64_t counter; - int64_t limit; + atomic64_t limit; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -70,8 +70,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->limit, PIDS_MAX); return &pids->css; } @@ -142,13 +142,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); + int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > p->limit) + if (new > limit) goto revert; } @@ -262,7 +263,7 @@ set_limit: * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. 
*/ - pids->limit = limit; + atomic64_set(&pids->limit, limit); return nbytes; } @@ -270,7 +271,7 @@ static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); - int64_t limit = pids->limit; + int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); diff --git a/kernel/cpu.c b/kernel/cpu.c index 98791b70277a..223f8f208df1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -648,7 +648,7 @@ int disable_nonboot_cpus(void) */ cpumask_clear(frozen_cpus); - pr_info("Disabling non-boot CPUs ...\n"); + pr_debug("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; @@ -698,7 +698,7 @@ void enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - pr_info("Enabling non-boot CPUs ...\n"); + pr_debug("Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); @@ -707,7 +707,7 @@ void enable_nonboot_cpus(void) error = _cpu_up(cpu, 1); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { - pr_info("CPU%d is up\n", cpu); + pr_debug("CPU%d is up\n", cpu); cpu_device = get_cpu_device(cpu); if (!cpu_device) pr_err("%s: failed to get cpu%d device\n", diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ebc52c7bd8a6..cba287a5c976 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2632,7 +2632,7 @@ static int kdb_per_cpu(int argc, const char **argv) diag = kdbgetularg(argv[3], &whichcpu); if (diag) return diag; - if (!cpu_online(whichcpu)) { + if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) { kdb_printf("cpu %ld is not online\n", whichcpu); return KDB_BADCPUNUM; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 1d065a82610d..40d738294f0c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5003,7 +5003,15 @@ accounting: */ user_lock_limit *= num_online_cpus(); - user_locked = atomic_long_read(&user->locked_vm) + user_extra; + user_locked = atomic_long_read(&user->locked_vm); + + /* + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit + */ + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += user_extra; if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; @@ -5992,10 +6000,17 @@ static void perf_event_task_output(struct perf_event *event, goto out; task_event->event_id.pid = perf_event_pid(event, task); - task_event->event_id.ppid = perf_event_pid(event, current); - task_event->event_id.tid = perf_event_tid(event, task); - task_event->event_id.ptid = perf_event_tid(event, current); + + if (task_event->event_id.header.type == PERF_RECORD_EXIT) { + task_event->event_id.ppid = perf_event_pid(event, + task->real_parent); + task_event->event_id.ptid = perf_event_pid(event, + task->real_parent); + } else { /* PERF_RECORD_FORK */ + task_event->event_id.ppid = perf_event_pid(event, current); + task_event->event_id.ptid = perf_event_tid(event, current); + } task_event->event_id.time = perf_event_clock(event); diff --git a/kernel/futex.c b/kernel/futex.c index d24d164e9bdb..e3ef6934b37f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -378,9 +378,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) 
/ 4, key->both.offset); + return &futex_queues[hash & (futex_hashsize - 1)]; } @@ -407,7 +407,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies MB (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies MB (B) */ @@ -438,7 +438,6 @@ static void drop_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - iput(key->shared.inode); break; case FUT_OFF_MMSHARED: mmdrop(key->private.mm); @@ -446,6 +445,46 @@ static void drop_futex_key_refs(union futex_key *key) } } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -458,9 +497,15 @@ static void drop_futex_key_refs(union futex_key *key) * * The key words are stored in *key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -628,8 +673,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -661,40 +704,14 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. 
- * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page_head); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(page); rcu_read_unlock(); } + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + out: put_page(page_head); return err; @@ -1462,8 +1479,16 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { - if (oparg < 0 || oparg > 31) - return -EINVAL; + if (oparg < 0 || oparg > 31) { + char comm[sizeof(current->comm)]; + /* + * kill this print and return -EINVAL when userspace + * is sane again + */ + pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", + get_task_comm(comm, current), oparg); + oparg &= 31; + } oparg = 1 << oparg; } diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index edf67c493a8e..e473f6a1f6ca 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -108,9 +108,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos) { struct gcov_iterator *iter = data; + (*pos)++; if (gcov_iter_next(iter)) return NULL; - (*pos)++; return iter; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5a2932713d0c..f4c99c41fee2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -33,6 +33,7 @@ static irqreturn_t bad_chained_irq(int irq, void *dev_id) */ struct irqaction chained_action = { .handler = bad_chained_irq, + .name = "chained-irq", }; /** @@ -327,11 +328,12 @@ void unmask_threaded_irq(struct irq_desc *desc) * handler. The handler function is called inside the calling * threads context. */ -void handle_nested_irq(unsigned int irq) +bool handle_nested_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; irqreturn_t action_ret; + bool handled = false; might_sleep(); @@ -356,8 +358,11 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + handled = true; + out_unlock: raw_spin_unlock_irq(&desc->lock); + return handled; } EXPORT_SYMBOL_GPL(handle_nested_irq); @@ -404,8 +409,10 @@ static bool irq_may_run(struct irq_desc *desc) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ -void handle_simple_irq(struct irq_desc *desc) +bool handle_simple_irq(struct irq_desc *desc) { + bool handled = false; + raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) @@ -421,8 +428,11 @@ void handle_simple_irq(struct irq_desc *desc) kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); + handled = true; + out_unlock: raw_spin_unlock(&desc->lock); + return handled; } EXPORT_SYMBOL_GPL(handle_simple_irq); @@ -453,8 +463,10 @@ static void cond_unmask_irq(struct irq_desc *desc) * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. 
*/ -void handle_level_irq(struct irq_desc *desc) +bool handle_level_irq(struct irq_desc *desc) { + bool handled = false; + raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -477,8 +489,11 @@ void handle_level_irq(struct irq_desc *desc) cond_unmask_irq(desc); + handled = true; + out_unlock: raw_spin_unlock(&desc->lock); + return handled; } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -522,9 +537,10 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ -void handle_fasteoi_irq(struct irq_desc *desc) +bool handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; + bool handled = false; raw_spin_lock(&desc->lock); @@ -552,12 +568,15 @@ void handle_fasteoi_irq(struct irq_desc *desc) cond_unmask_eoi_irq(desc, chip); + handled = true; + raw_spin_unlock(&desc->lock); - return; + return handled; out: if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); + return handled; } EXPORT_SYMBOL_GPL(handle_fasteoi_irq); @@ -576,8 +595,10 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void handle_edge_irq(struct irq_desc *desc) +bool handle_edge_irq(struct irq_desc *desc) { + bool handled = false; + raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); @@ -621,12 +642,14 @@ void handle_edge_irq(struct irq_desc *desc) } handle_irq_event(desc); + handled = true; } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); + return handled; } EXPORT_SYMBOL(handle_edge_irq); @@ -638,8 +661,9 @@ EXPORT_SYMBOL(handle_edge_irq); * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. */ -void handle_edge_eoi_irq(struct irq_desc *desc) +bool handle_edge_eoi_irq(struct irq_desc *desc) { + bool handled = false; struct irq_chip *chip = irq_desc_get_chip(desc); raw_spin_lock(&desc->lock); @@ -667,6 +691,7 @@ void handle_edge_eoi_irq(struct irq_desc *desc) goto out_eoi; handle_irq_event(desc); + handled = true; } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); @@ -674,6 +699,7 @@ void handle_edge_eoi_irq(struct irq_desc *desc) out_eoi: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); + return handled; } #endif @@ -683,7 +709,7 @@ out_eoi: * * Per CPU interrupts on SMP machines without locking requirements */ -void handle_percpu_irq(struct irq_desc *desc) +bool handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -696,6 +722,8 @@ void handle_percpu_irq(struct irq_desc *desc) if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); + + return true; } /** @@ -709,7 +737,7 @@ void handle_percpu_irq(struct irq_desc *desc) * contain the real device id for the cpu on which this handler is * called */ -void handle_percpu_devid_irq(struct irq_desc *desc) +bool handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; @@ -728,6 +756,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); + + return true; } void diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 57bff7857e87..80e76ddbf804 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -26,13 +26,14 @@ * * Handles spurious and unhandled IRQ's. 
It also prints a debugmessage. */ -void handle_bad_irq(struct irq_desc *desc) +bool handle_bad_irq(struct irq_desc *desc) { unsigned int irq = irq_desc_get_irq(desc); print_irq_desc(irq, desc); kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); + return true; } EXPORT_SYMBOL_GPL(handle_bad_irq); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 239e2ae2c947..52fbf88cd2d8 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include <linux/radix-tree.h> #include <linux/bitmap.h> #include <linux/irqdomain.h> +#include <linux/wakeup_reason.h> #include "internals.h" @@ -339,16 +340,25 @@ void irq_init_desc(unsigned int irq) /** * generic_handle_irq - Invoke the handler for a particular irq * @irq: The irq number to handle - * + * returns: + * negative on error + * 0 when the interrupt handler was not called + * 1 when the interrupt handler was called */ + int generic_handle_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc) return -EINVAL; - generic_handle_irq_desc(desc); - return 0; + + if (unlikely(logging_wakeup_reasons_nosync())) + return log_possible_wakeup_reason(irq, + desc, + generic_handle_irq_desc); + + return generic_handle_irq_desc(desc); } EXPORT_SYMBOL_GPL(generic_handle_irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d0193c0b2531..4746500d65ec 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -235,7 +235,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, + desc->affinity_notify->release); + } } irqd_set(data, IRQD_AFFINITY_SET); @@ -335,7 +339,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { - cancel_work_sync(&old_notify->work); + if (cancel_work_sync(&old_notify->work)) { + /* Pending work had a ref, put that one too */ + kref_put(&old_notify->kref, old_notify->release); + } kref_put(&old_notify->kref, old_notify->release); } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 37ddb7bda651..ec7c7eda0774 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,17 +7,18 @@ void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_data *data = &desc->irq_data; + struct irq_chip *chip = data->chip; - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + if (likely(!irqd_is_setaffinity_pending(data))) return; - irqd_clr_move_pending(&desc->irq_data); + irqd_clr_move_pending(data); /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (irqd_is_per_cpu(&desc->irq_data)) { + if (irqd_is_per_cpu(data)) { WARN_ON(1); return; } @@ -42,9 +43,20 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. 
*/ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) - irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); - + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) { + int ret; + + ret = irq_do_set_affinity(data, desc->pending_mask, false); + /* + * If the there is a cleanup pending in the underlying + * vector management, reschedule the move for the next + * interrupt. Leave desc->pending_mask intact. + */ + if (ret == -EBUSY) { + irqd_set_move_pending(data); + return; + } + } cpumask_clear(desc->pending_mask); } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cea1de0161f1..28e134310435 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -11,7 +11,7 @@ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> - +#include <linux/wakeup_reason.h> #include "internals.h" bool irq_pm_check_wakeup(struct irq_desc *desc) diff --git a/kernel/kmod.c b/kernel/kmod.c index 0277d1216f80..e4e5e98002fe 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -119,7 +119,7 @@ out: * invoke it. * * If module auto-loading support is disabled then this function - * becomes a no-operation. + * simply returns -ENOENT. */ int __request_module(bool wait, const char *fmt, ...) { @@ -140,7 +140,7 @@ int __request_module(bool wait, const char *fmt, ...) WARN_ON_ONCE(wait && current_is_async()); if (!modprobe_path[0]) - return 0; + return -ENOENT; va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a419696709a1..0a00720d3ccc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1265,9 +1265,11 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); + current->lockdep_recursion = 1; arch_spin_lock(&lockdep_lock); ret = __lockdep_count_forward_deps(&this); arch_spin_unlock(&lockdep_lock); + current->lockdep_recursion = 0; raw_local_irq_restore(flags); return ret; @@ -1292,9 +1294,11 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); + current->lockdep_recursion = 1; arch_spin_lock(&lockdep_lock); ret = __lockdep_count_backward_deps(&this); arch_spin_unlock(&lockdep_lock); + current->lockdep_recursion = 0; raw_local_irq_restore(flags); return ret; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index d580b7d6ee6d..ad5aea269f76 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -655,10 +655,10 @@ static void __torture_print_stats(char *page, if (statp[i].n_lock_fail) fail = true; sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_fail) - max = statp[i].n_lock_fail; - if (min > statp[i].n_lock_fail) - min = statp[i].n_lock_fail; + if (max < statp[i].n_lock_acquired) + max = statp[i].n_lock_acquired; + if (min > statp[i].n_lock_acquired) + min = statp[i].n_lock_acquired; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index d381f559e0ce..989991f00dc7 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -53,19 +53,19 @@ EXPORT_SYMBOL(__rwlock_init); static void spin_dump(raw_spinlock_t *lock, const char *msg) { - struct task_struct *owner = NULL; + struct task_struct *owner = READ_ONCE(lock->owner); - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; + if (owner == 
SPINLOCK_OWNER_INIT) + owner = NULL; printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", msg, raw_smp_processor_id(), current->comm, task_pid_nr(current)); printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " ".owner_cpu: %d\n", - lock, lock->magic, + lock, READ_ONCE(lock->magic), owner ? owner->comm : "<none>", owner ? task_pid_nr(owner) : -1, - lock->owner_cpu); + READ_ONCE(lock->owner_cpu)); #ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG msm_trigger_wdog_bite(); #elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG) @@ -87,16 +87,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg) static inline void debug_spin_lock_before(raw_spinlock_t *lock) { - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner == current, lock, "recursion"); - SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } static inline void debug_spin_lock_after(raw_spinlock_t *lock) { - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; + WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id()); + WRITE_ONCE(lock->owner, current); } static inline void debug_spin_unlock(raw_spinlock_t *lock) @@ -106,8 +106,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock) SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; + WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT); + WRITE_ONCE(lock->owner_cpu, -1); } static void __spin_lock_debug(raw_spinlock_t *lock) @@ -245,8 +245,8 @@ static inline void debug_write_lock_before(rwlock_t *lock) static inline void debug_write_lock_after(rwlock_t *lock) { - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; + WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id()); + WRITE_ONCE(lock->owner, current); } static inline void debug_write_unlock(rwlock_t *lock) @@ -255,8 +255,8 @@ static inline void debug_write_unlock(rwlock_t *lock) RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; + WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT); + WRITE_ONCE(lock->owner_cpu, -1); } #if 0 /* This can cause lockups */ diff --git a/kernel/module.c b/kernel/module.c index ad4928210e28..3b7aac2ea1e5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1014,6 +1014,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); free_module(mod); + /* someone could wait for the module in add_unformed_module() */ + wake_up_all(&module_wq); return 0; out: mutex_unlock(&module_mutex); diff --git a/kernel/notifier.c b/kernel/notifier.c index fd2c9acbcc19..0f70f1b6fdaa 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -552,7 +552,7 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_all(); + vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/padata.c b/kernel/padata.c index 282b489a286d..c50975f43b34 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -33,6 +33,8 @@ #define MAX_OBJ_NUM 1000 +static 
void padata_free_pd(struct parallel_data *pd); + static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; @@ -63,15 +65,11 @@ static int padata_cpu_hash(struct parallel_data *pd) static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); pqueue = container_of(parallel_work, struct padata_parallel_queue, work); - pd = pqueue->pd; - pinst = pd->pinst; spin_lock(&pqueue->parallel.lock); list_replace_init(&pqueue->parallel.list, &local_list); @@ -134,6 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->cb_cpu = cb_cpu; target_cpu = padata_cpu_hash(pd); + padata->cpu = target_cpu; queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -157,8 +156,6 @@ EXPORT_SYMBOL(padata_do_parallel); * A pointer to the control struct of the next object that needs * serialization, if present in one of the percpu reorder queues. * - * NULL, if all percpu reorder queues are empty. - * * -EINPROGRESS, if the next object that needs serialization will * be parallel processed by another cpu and is not yet present in * the cpu's reorder queue. @@ -168,25 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel); */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus; - unsigned int next_nr, next_index; struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; + int cpu = pd->cpu; - num_cpus = cpumask_weight(pd->cpumask.pcpu); - - /* - * Calculate the percpu reorder queue and the sequence - * number of the next object. - */ - next_nr = pd->processed; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - - padata = NULL; - reorder = &next_queue->reorder; spin_lock(&reorder->lock); @@ -197,7 +181,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); - pd->processed++; + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, + false); spin_unlock(&reorder->lock); goto out; @@ -220,6 +205,7 @@ static void padata_reorder(struct parallel_data *pd) struct padata_priv *padata; struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; + struct padata_parallel_queue *next_queue; /* * We need to ensure that only one cpu can work on dequeueing of @@ -238,12 +224,11 @@ static void padata_reorder(struct parallel_data *pd) padata = padata_get_next(pd); /* - * All reorder queues are empty, or the next object that needs - * serialization is parallel processed by another cpu and is - * still on it's way to the cpu's reorder queue, nothing to - * do for now. + * If the next object that needs serialization is parallel + * processed by another cpu and is still on it's way to the + * cpu's reorder queue, nothing to do for now. */ - if (!padata || PTR_ERR(padata) == -EINPROGRESS) + if (PTR_ERR(padata) == -EINPROGRESS) break; /* @@ -252,7 +237,6 @@ static void padata_reorder(struct parallel_data *pd) * so exit immediately. 
*/ if (PTR_ERR(padata) == -ENODATA) { - del_timer(&pd->timer); spin_unlock_bh(&pd->lock); return; } @@ -271,28 +255,27 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to - * the reorder queues in the meantime, we will be called again - * from the timer function if no one else cares for it. + * the reorder queues in the meantime. * - * Ensure reorder_objects is read after pd->lock is dropped so we see - * an increment from another task in padata_do_serial. Pairs with + * Ensure reorder queue is read after pd->lock is dropped so we see + * new objects from another task in padata_do_serial. Pairs with * smp_mb__after_atomic in padata_do_serial. */ smp_mb(); - if (atomic_read(&pd->reorder_objects) - && !(pinst->flags & PADATA_RESET)) - mod_timer(&pd->timer, jiffies + HZ); - else - del_timer(&pd->timer); - return; + next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); + if (!list_empty(&next_queue->reorder.list)) + queue_work(pinst->wq, &pd->reorder_work); } -static void padata_reorder_timer(unsigned long arg) +static void invoke_padata_reorder(struct work_struct *work) { - struct parallel_data *pd = (struct parallel_data *)arg; + struct parallel_data *pd; + local_bh_disable(); + pd = container_of(work, struct parallel_data, reorder_work); padata_reorder(pd); + local_bh_enable(); } static void padata_serial_worker(struct work_struct *serial_work) @@ -300,6 +283,7 @@ static void padata_serial_worker(struct work_struct *serial_work) struct padata_serial_queue *squeue; struct parallel_data *pd; LIST_HEAD(local_list); + int cnt; local_bh_disable(); squeue = container_of(serial_work, struct padata_serial_queue, work); @@ -309,6 +293,8 @@ static void padata_serial_worker(struct work_struct *serial_work) list_replace_init(&squeue->serial.list, &local_list); spin_unlock(&squeue->serial.lock); + cnt = 0; + while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -318,9 +304,12 @@ static void padata_serial_worker(struct work_struct *serial_work) list_del_init(&padata->list); padata->serial(padata); - atomic_dec(&pd->refcnt); + cnt++; } local_bh_enable(); + + if (atomic_sub_and_test(cnt, &pd->refcnt)) + padata_free_pd(pd); } /** @@ -333,29 +322,22 @@ static void padata_serial_worker(struct work_struct *serial_work) */ void padata_do_serial(struct padata_priv *padata) { - int cpu; - struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - - pd = padata->pd; - - cpu = get_cpu(); - pqueue = per_cpu_ptr(pd->pqueue, cpu); + struct parallel_data *pd = padata->pd; + struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, + padata->cpu); spin_lock(&pqueue->reorder.lock); - atomic_inc(&pd->reorder_objects); list_add_tail(&padata->list, &pqueue->reorder.list); + atomic_inc(&pd->reorder_objects); spin_unlock(&pqueue->reorder.lock); /* - * Ensure the atomic_inc of reorder_objects above is ordered correctly + * Ensure the addition to the reorder list is ordered correctly * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb * in padata_reorder. 
*/ smp_mb__after_atomic(); - put_cpu(); - padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); @@ -404,9 +386,14 @@ static void padata_init_pqueues(struct parallel_data *pd) struct padata_parallel_queue *pqueue; cpu_index = 0; - for_each_cpu(cpu, pd->cpumask.pcpu) { + for_each_possible_cpu(cpu) { pqueue = per_cpu_ptr(pd->pqueue, cpu); - pqueue->pd = pd; + + if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) { + pqueue->cpu_index = -1; + continue; + } + pqueue->cpu_index = cpu_index; cpu_index++; @@ -440,12 +427,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); - setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); - atomic_set(&pd->refcnt, 0); + atomic_set(&pd->refcnt, 1); pd->pinst = pinst; spin_lock_init(&pd->lock); + pd->cpu = cpumask_first(pd->cpumask.pcpu); + INIT_WORK(&pd->reorder_work, invoke_padata_reorder); return pd; @@ -468,31 +456,6 @@ static void padata_free_pd(struct parallel_data *pd) kfree(pd); } -/* Flush all objects out of the padata queues. */ -static void padata_flush_queues(struct parallel_data *pd) -{ - int cpu; - struct padata_parallel_queue *pqueue; - struct padata_serial_queue *squeue; - - for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - flush_work(&pqueue->work); - } - - del_timer_sync(&pd->timer); - - if (atomic_read(&pd->reorder_objects)) - padata_reorder(pd); - - for_each_cpu(cpu, pd->cpumask.cbcpu) { - squeue = per_cpu_ptr(pd->squeue, cpu); - flush_work(&squeue->work); - } - - BUG_ON(atomic_read(&pd->refcnt) != 0); -} - static void __padata_start(struct padata_instance *pinst) { pinst->flags |= PADATA_INIT; @@ -506,10 +469,6 @@ static void __padata_stop(struct padata_instance *pinst) pinst->flags &= ~PADATA_INIT; synchronize_rcu(); - - get_online_cpus(); - padata_flush_queues(pinst->pd); - put_online_cpus(); } /* Replace the internal control structure with a new one. 
*/ @@ -530,8 +489,8 @@ static void padata_replace(struct padata_instance *pinst, if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) notification_mask |= PADATA_CPU_SERIAL; - padata_flush_queues(pd_old); - padata_free_pd(pd_old); + if (atomic_dec_and_test(&pd_old->refcnt)) + padata_free_pd(pd_old); if (notification_mask) blocking_notifier_call_chain(&pinst->cpumask_change_notifier, @@ -661,8 +620,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, struct cpumask *serial_mask, *parallel_mask; int err = -EINVAL; - mutex_lock(&pinst->lock); get_online_cpus(); + mutex_lock(&pinst->lock); switch (cpumask_type) { case PADATA_CPU_PARALLEL: @@ -680,8 +639,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); out: - put_online_cpus(); mutex_unlock(&pinst->lock); + put_online_cpus(); return err; } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4335e7d1c391..500ba8b970e4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -322,3 +322,7 @@ config PM_GENERIC_DOMAINS_OF config CPU_PM bool + +config DEDUCE_WAKEUP_REASONS + bool + default n diff --git a/kernel/power/process.c b/kernel/power/process.c index cc177142a08f..372de061dda2 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -37,9 +37,6 @@ static int try_to_freeze_tasks(bool user_only) unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; -#ifdef CONFIG_PM_SLEEP - char suspend_abort[MAX_SUSPEND_ABORT_LEN]; -#endif do_gettimeofday(&start); @@ -69,11 +66,6 @@ static int try_to_freeze_tasks(bool user_only) break; if (pm_wakeup_pending()) { -#ifdef CONFIG_PM_SLEEP - pm_get_active_wakeup_sources(suspend_abort, - MAX_SUSPEND_ABORT_LEN); - log_suspend_abort_reason(suspend_abort); -#endif wakeup = true; break; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 58209d8bfc56..6e7832ee6d74 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -287,6 +287,7 @@ static int suspend_prepare(suspend_state_t state) if (!error) return 0; + log_suspend_abort_reason("One or more tasks refusing to freeze"); suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); Finish: @@ -316,7 +317,6 @@ void __weak arch_suspend_enable_irqs(void) */ static int suspend_enter(suspend_state_t state, bool *wakeup) { - char suspend_abort[MAX_SUSPEND_ABORT_LEN]; int error, last_dev; error = platform_suspend_prepare(state); @@ -385,11 +385,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) state, false); events_check_enabled = false; } else if (*wakeup) { - pm_get_active_wakeup_sources(suspend_abort, - MAX_SUSPEND_ABORT_LEN); - log_suspend_abort_reason(suspend_abort); error = -EBUSY; } + + start_logging_wakeup_reasons(); syscore_resume(); } diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c index 252611fad2fe..44d8da2952c5 100644 --- a/kernel/power/wakeup_reason.c +++ b/kernel/power/wakeup_reason.c @@ -26,42 +26,232 @@ #include <linux/spinlock.h> #include <linux/notifier.h> #include <linux/suspend.h> - +#include <linux/slab.h> #define MAX_WAKEUP_REASON_IRQS 32 -static int irq_list[MAX_WAKEUP_REASON_IRQS]; -static int irqcount; static bool suspend_abort; static char abort_reason[MAX_SUSPEND_ABORT_LEN]; + +static struct wakeup_irq_node *base_irq_nodes; +static struct wakeup_irq_node *cur_irq_tree; +static int cur_irq_tree_depth; +static LIST_HEAD(wakeup_irqs); + +static struct kmem_cache *wakeup_irq_nodes_cache; 
static struct kobject *wakeup_reason; -static DEFINE_SPINLOCK(resume_reason_lock); +static spinlock_t resume_reason_lock; +bool log_wakeups __read_mostly; +struct completion wakeups_completion; static ktime_t last_monotime; /* monotonic time before last suspend */ static ktime_t curr_monotime; /* monotonic time after last suspend */ static ktime_t last_stime; /* monotonic boottime offset before last suspend */ static ktime_t curr_stime; /* monotonic boottime offset after last suspend */ -static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) +static void init_wakeup_irq_node(struct wakeup_irq_node *p, int irq) { - int irq_no, buf_offset = 0; - struct irq_desc *desc; - spin_lock(&resume_reason_lock); - if (suspend_abort) { - buf_offset = sprintf(buf, "Abort: %s", abort_reason); - } else { - for (irq_no = 0; irq_no < irqcount; irq_no++) { - desc = irq_to_desc(irq_list[irq_no]); - if (desc && desc->action && desc->action->name) - buf_offset += sprintf(buf + buf_offset, "%d %s\n", - irq_list[irq_no], desc->action->name); - else - buf_offset += sprintf(buf + buf_offset, "%d\n", - irq_list[irq_no]); + p->irq = irq; + p->desc = irq_to_desc(irq); + p->child = NULL; + p->parent = NULL; + p->handled = false; + INIT_LIST_HEAD(&p->siblings); + INIT_LIST_HEAD(&p->next); +} + +static struct wakeup_irq_node* alloc_irq_node(int irq) +{ + struct wakeup_irq_node *n; + + n = kmem_cache_alloc(wakeup_irq_nodes_cache, GFP_ATOMIC); + if (!n) { + pr_warning("Failed to log chained wakeup IRQ %d\n", + irq); + return NULL; + } + + init_wakeup_irq_node(n, irq); + return n; +} + +static struct wakeup_irq_node * +search_siblings(struct wakeup_irq_node *root, int irq) +{ + bool found = false; + struct wakeup_irq_node *n = NULL; + BUG_ON(!root); + + if (root->irq == irq) + return root; + + list_for_each_entry(n, &root->siblings, siblings) { + if (n->irq == irq) { + found = true; + break; } } - spin_unlock(&resume_reason_lock); - return buf_offset; + + return found ? 
n : NULL; +} + +static struct wakeup_irq_node * +add_to_siblings(struct wakeup_irq_node *root, int irq) +{ + struct wakeup_irq_node *n; + if (root) { + n = search_siblings(root, irq); + if (n) + return n; + } + n = alloc_irq_node(irq); + + if (n && root) + list_add(&n->siblings, &root->siblings); + return n; +} + +#ifdef CONFIG_DEDUCE_WAKEUP_REASONS +static struct wakeup_irq_node* add_child(struct wakeup_irq_node *root, int irq) +{ + if (!root->child) { + root->child = alloc_irq_node(irq); + if (!root->child) + return NULL; + root->child->parent = root; + return root->child; + } + + return add_to_siblings(root->child, irq); +} + +static struct wakeup_irq_node *find_first_sibling(struct wakeup_irq_node *node) +{ + struct wakeup_irq_node *n; + if (node->parent) + return node; + list_for_each_entry(n, &node->siblings, siblings) { + if (n->parent) + return n; + } + return NULL; +} + +static struct wakeup_irq_node * +get_base_node(struct wakeup_irq_node *node, unsigned depth) +{ + if (!node) + return NULL; + + while (depth) { + node = find_first_sibling(node); + BUG_ON(!node); + node = node->parent; + depth--; + } + + return node; +} +#endif /* CONFIG_DEDUCE_WAKEUP_REASONS */ + +static const struct list_head* get_wakeup_reasons_nosync(void); + +static void print_wakeup_sources(void) +{ + struct wakeup_irq_node *n; + const struct list_head *wakeups; + + if (suspend_abort) { + pr_info("Abort: %s\n", abort_reason); + return; + } + + wakeups = get_wakeup_reasons_nosync(); + list_for_each_entry(n, wakeups, next) { + if (n->desc && n->desc->action && n->desc->action->name) + pr_info("Resume caused by IRQ %d, %s\n", n->irq, + n->desc->action->name); + else + pr_info("Resume caused by IRQ %d\n", n->irq); + } +} + +static bool walk_irq_node_tree(struct wakeup_irq_node *root, + bool (*visit)(struct wakeup_irq_node *, void *), + void *cookie) +{ + struct wakeup_irq_node *n, *t; + + if (!root) + return true; + + list_for_each_entry_safe(n, t, &root->siblings, siblings) { + if (!walk_irq_node_tree(n->child, visit, cookie)) + return false; + if (!visit(n, cookie)) + return false; + } + + if (!walk_irq_node_tree(root->child, visit, cookie)) + return false; + return visit(root, cookie); +} + +#ifdef CONFIG_DEDUCE_WAKEUP_REASONS +static bool is_node_handled(struct wakeup_irq_node *n, void *_p) +{ + return n->handled; +} + +static bool base_irq_nodes_done(void) +{ + return walk_irq_node_tree(base_irq_nodes, is_node_handled, NULL); +} +#endif + +struct buf_cookie { + char *buf; + int buf_offset; +}; + +static bool print_leaf_node(struct wakeup_irq_node *n, void *_p) +{ + struct buf_cookie *b = _p; + if (!n->child) { + if (n->desc && n->desc->action && n->desc->action->name) + b->buf_offset += + snprintf(b->buf + b->buf_offset, + PAGE_SIZE - b->buf_offset, + "%d %s\n", + n->irq, n->desc->action->name); + else + b->buf_offset += + snprintf(b->buf + b->buf_offset, + PAGE_SIZE - b->buf_offset, + "%d\n", + n->irq); + } + return true; +} + +static ssize_t last_resume_reason_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned long flags; + + struct buf_cookie b = { + .buf = buf, + .buf_offset = 0 + }; + + spin_lock_irqsave(&resume_reason_lock, flags); + if (suspend_abort) + b.buf_offset = snprintf(buf, PAGE_SIZE, "Abort: %s", abort_reason); + else + walk_irq_node_tree(base_irq_nodes, print_leaf_node, &b); + spin_unlock_irqrestore(&resume_reason_lock, flags); + + return b.buf_offset; } static ssize_t last_suspend_time_show(struct kobject *kobj, @@ -104,56 +294,155 @@ static struct 
attribute_group attr_group = { .attrs = attrs, }; +static inline void stop_logging_wakeup_reasons(void) +{ + ACCESS_ONCE(log_wakeups) = false; + smp_wmb(); +} + /* - * logs all the wake up reasons to the kernel - * stores the irqs to expose them to the userspace via sysfs + * stores the immediate wakeup irqs; these often aren't the ones seen by + * the drivers that registered them, due to chained interrupt controllers, + * and multiple-interrupt dispatch. */ -void log_wakeup_reason(int irq) +void log_base_wakeup_reason(int irq) { - struct irq_desc *desc; - desc = irq_to_desc(irq); - if (desc && desc->action && desc->action->name) - printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq, - desc->action->name); - else - printk(KERN_INFO "Resume caused by IRQ %d\n", irq); + /* No locking is needed, since this function is called within + * syscore_resume, with both nonboot CPUs and interrupts disabled. + */ + base_irq_nodes = add_to_siblings(base_irq_nodes, irq); + BUG_ON(!base_irq_nodes); +#ifndef CONFIG_DEDUCE_WAKEUP_REASONS + base_irq_nodes->handled = true; +#endif +} - spin_lock(&resume_reason_lock); - if (irqcount == MAX_WAKEUP_REASON_IRQS) { - spin_unlock(&resume_reason_lock); - printk(KERN_WARNING "Resume caused by more than %d IRQs\n", - MAX_WAKEUP_REASON_IRQS); - return; +#ifdef CONFIG_DEDUCE_WAKEUP_REASONS + +/* This function is called by generic_handle_irq, which may call itself + * recursively. This happens with interrupts disabled. Using + * log_possible_wakeup_reason, we build a tree of interrupts, tracing the call + * stack of generic_handle_irq, for each wakeup source containing the + * interrupts actually handled. + * + * Most of these "trees" would either have a single node (in the event that the + * wakeup source is the final interrupt), or consist of a list of two + * interrupts, with the wakeup source at the root, and the final dispatched + * interrupt at the leaf. + * + * When *all* wakeup sources have been thusly spoken for, this function will + * clear the log_wakeups flag, and print the wakeup reasons. + + TODO: percpu + + */ + +static struct wakeup_irq_node * +log_possible_wakeup_reason_start(int irq, struct irq_desc *desc, unsigned depth) +{ + BUG_ON(!irqs_disabled()); + BUG_ON((signed)depth < 0); + + /* This function can race with a call to stop_logging_wakeup_reasons() + * from a thread context. If this happens, just exit silently, as we are no + * longer interested in logging interrupts. + */ + if (!logging_wakeup_reasons()) + return NULL; + + /* If suspend was aborted, the base IRQ nodes are missing, and we stop + * logging interrupts immediately. + */ + if (!base_irq_nodes) { + stop_logging_wakeup_reasons(); + return NULL; + } + + /* We assume wakeup interrupts are handlerd only by the first core. 
*/ + /* TODO: relax this by having percpu versions of the irq tree */ + if (smp_processor_id() != 0) { + return NULL; } - irq_list[irqcount++] = irq; - spin_unlock(&resume_reason_lock); + if (depth == 0) { + cur_irq_tree_depth = 0; + cur_irq_tree = search_siblings(base_irq_nodes, irq); + } + else if (cur_irq_tree) { + if (depth > cur_irq_tree_depth) { + BUG_ON(depth - cur_irq_tree_depth > 1); + cur_irq_tree = add_child(cur_irq_tree, irq); + if (cur_irq_tree) + cur_irq_tree_depth++; + } + else { + cur_irq_tree = get_base_node(cur_irq_tree, + cur_irq_tree_depth - depth); + cur_irq_tree_depth = depth; + cur_irq_tree = add_to_siblings(cur_irq_tree, irq); + } + } + + return cur_irq_tree; } -int check_wakeup_reason(int irq) +static void log_possible_wakeup_reason_complete(struct wakeup_irq_node *n, + unsigned depth, + bool handled) { - int irq_no; - int ret = false; - - spin_lock(&resume_reason_lock); - for (irq_no = 0; irq_no < irqcount; irq_no++) - if (irq_list[irq_no] == irq) { - ret = true; - break; + if (!n) + return; + n->handled = handled; + if (depth == 0) { + if (base_irq_nodes_done()) { + stop_logging_wakeup_reasons(); + complete(&wakeups_completion); + print_wakeup_sources(); + } } - spin_unlock(&resume_reason_lock); - return ret; } +bool log_possible_wakeup_reason(int irq, + struct irq_desc *desc, + bool (*handler)(struct irq_desc *)) +{ + static DEFINE_PER_CPU(unsigned int, depth); + + struct wakeup_irq_node *n; + bool handled; + unsigned d; + + d = get_cpu_var(depth)++; + put_cpu_var(depth); + + n = log_possible_wakeup_reason_start(irq, desc, d); + + handled = handler(desc); + + d = --get_cpu_var(depth); + put_cpu_var(depth); + + if (!handled && desc && desc->action) + pr_debug("%s: irq %d action %pF not handled\n", __func__, + irq, desc->action->handler); + + log_possible_wakeup_reason_complete(n, d, handled); + + return handled; +} + +#endif /* CONFIG_DEDUCE_WAKEUP_REASONS */ + void log_suspend_abort_reason(const char *fmt, ...) { va_list args; + unsigned long flags; - spin_lock(&resume_reason_lock); + spin_lock_irqsave(&resume_reason_lock, flags); //Suspend abort reason has already been logged. if (suspend_abort) { - spin_unlock(&resume_reason_lock); + spin_unlock_irqrestore(&resume_reason_lock, flags); return; } @@ -161,29 +450,128 @@ void log_suspend_abort_reason(const char *fmt, ...) 
va_start(args, fmt); vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args); va_end(args); - spin_unlock(&resume_reason_lock); + + spin_unlock_irqrestore(&resume_reason_lock, flags); +} + +static bool match_node(struct wakeup_irq_node *n, void *_p) +{ + int irq = *((int *)_p); + return n->irq != irq; +} + +int check_wakeup_reason(int irq) +{ + bool found; + unsigned long flags; + spin_lock_irqsave(&resume_reason_lock, flags); + found = !walk_irq_node_tree(base_irq_nodes, match_node, &irq); + spin_unlock_irqrestore(&resume_reason_lock, flags); + return found; +} + +static bool build_leaf_nodes(struct wakeup_irq_node *n, void *_p) +{ + struct list_head *wakeups = _p; + if (!n->child) + list_add(&n->next, wakeups); + return true; +} + +static const struct list_head* get_wakeup_reasons_nosync(void) +{ + BUG_ON(logging_wakeup_reasons()); + INIT_LIST_HEAD(&wakeup_irqs); + walk_irq_node_tree(base_irq_nodes, build_leaf_nodes, &wakeup_irqs); + return &wakeup_irqs; +} + +static bool build_unfinished_nodes(struct wakeup_irq_node *n, void *_p) +{ + struct list_head *unfinished = _p; + if (!n->handled) { + pr_warning("%s: wakeup irq %d was not handled\n", + __func__, n->irq); + list_add(&n->next, unfinished); + } + return true; +} + +const struct list_head* get_wakeup_reasons(unsigned long timeout, + struct list_head *unfinished) +{ + INIT_LIST_HEAD(unfinished); + + if (logging_wakeup_reasons()) { + unsigned long signalled = 0; + if (timeout) + signalled = wait_for_completion_timeout(&wakeups_completion, timeout); + if (WARN_ON(!signalled)) { + stop_logging_wakeup_reasons(); + walk_irq_node_tree(base_irq_nodes, build_unfinished_nodes, unfinished); + return NULL; + } + pr_info("%s: waited for %u ms\n", + __func__, + jiffies_to_msecs(timeout - signalled)); + } + + return get_wakeup_reasons_nosync(); +} + +static bool delete_node(struct wakeup_irq_node *n, void *unused) +{ + list_del(&n->siblings); + kmem_cache_free(wakeup_irq_nodes_cache, n); + return true; +} + +void clear_wakeup_reasons(void) +{ + unsigned long flags; + spin_lock_irqsave(&resume_reason_lock, flags); + + BUG_ON(logging_wakeup_reasons()); + walk_irq_node_tree(base_irq_nodes, delete_node, NULL); + base_irq_nodes = NULL; + cur_irq_tree = NULL; + cur_irq_tree_depth = 0; + INIT_LIST_HEAD(&wakeup_irqs); + suspend_abort = false; + + spin_unlock_irqrestore(&resume_reason_lock, flags); } /* Detects a suspend and clears all the previous wake up reasons*/ static int wakeup_reason_pm_event(struct notifier_block *notifier, unsigned long pm_event, void *unused) { + unsigned long flags; switch (pm_event) { case PM_SUSPEND_PREPARE: - spin_lock(&resume_reason_lock); - irqcount = 0; + spin_lock_irqsave(&resume_reason_lock, flags); suspend_abort = false; - spin_unlock(&resume_reason_lock); + spin_unlock_irqrestore(&resume_reason_lock, flags); /* monotonic time since boot */ last_monotime = ktime_get(); /* monotonic time since boot including the time spent in suspend */ last_stime = ktime_get_boottime(); + clear_wakeup_reasons(); break; case PM_POST_SUSPEND: /* monotonic time since boot */ curr_monotime = ktime_get(); /* monotonic time since boot including the time spent in suspend */ curr_stime = ktime_get_boottime(); +#ifdef CONFIG_DEDUCE_WAKEUP_REASONS + /* log_wakeups should have been cleared by now. 
*/ + if (WARN_ON(logging_wakeup_reasons())) { + stop_logging_wakeup_reasons(); + print_wakeup_sources(); + } +#else + print_wakeup_sources(); +#endif break; default: break; @@ -195,31 +583,46 @@ static struct notifier_block wakeup_reason_pm_notifier_block = { .notifier_call = wakeup_reason_pm_event, }; -/* Initializes the sysfs parameter - * registers the pm_event notifier - */ int __init wakeup_reason_init(void) { - int retval; + spin_lock_init(&resume_reason_lock); - retval = register_pm_notifier(&wakeup_reason_pm_notifier_block); - if (retval) - printk(KERN_WARNING "[%s] failed to register PM notifier %d\n", - __func__, retval); + if (register_pm_notifier(&wakeup_reason_pm_notifier_block)) { + pr_warning("[%s] failed to register PM notifier\n", + __func__); + goto fail; + } wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj); if (!wakeup_reason) { - printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n", + pr_warning("[%s] failed to create a sysfs kobject\n", __func__); - return 1; + goto fail_unregister_pm_notifier; } - retval = sysfs_create_group(wakeup_reason, &attr_group); - if (retval) { - kobject_put(wakeup_reason); - printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n", - __func__, retval); + + if (sysfs_create_group(wakeup_reason, &attr_group)) { + pr_warning("[%s] failed to create a sysfs group\n", + __func__); + goto fail_kobject_put; } + + wakeup_irq_nodes_cache = + kmem_cache_create("wakeup_irq_node_cache", + sizeof(struct wakeup_irq_node), 0, + 0, NULL); + if (!wakeup_irq_nodes_cache) + goto fail_remove_group; + return 0; + +fail_remove_group: + sysfs_remove_group(wakeup_reason, &attr_group); +fail_kobject_put: + kobject_put(wakeup_reason); +fail_unregister_pm_notifier: + unregister_pm_notifier(&wakeup_reason_pm_notifier_block); +fail: + return 1; } late_initcall(wakeup_reason_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fd63b4d06139..4bbafc2a4822 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -497,7 +497,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, int source) +static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've @@ -525,7 +525,6 @@ int check_syslog_permissions(int type, int source) ok: return security_syslog(type); } -EXPORT_SYMBOL_GPL(check_syslog_permissions); static void append_char(char **pp, char *e, char c) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f6f8bb2f0d95..72e1ffe809f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2132,6 +2132,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + cpufreq_update_util(rq, 0); raw_spin_unlock(&rq->lock); rcu_read_lock(); @@ -2225,6 +2226,7 @@ static void try_to_wake_up_local(struct task_struct *p) update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + cpufreq_update_util(rq, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); note_task_waking(p, wallclock); } @@ -3196,6 +3198,8 @@ void scheduler_tick(void) calc_global_load_tick(rq); wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + + cpufreq_update_util(rq, 0); early_notif = early_detection_notify(rq, wallclock); raw_spin_unlock(&rq->lock); @@ 
-3564,6 +3568,7 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); + cpufreq_update_util(rq, 0); if (!is_idle_task(prev) && !prev->on_rq) update_avg_burst(prev); @@ -3582,6 +3587,7 @@ static void __sched notrace __schedule(bool preempt) cpu = cpu_of(rq); } else { update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); + cpufreq_update_util(rq, 0); lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); } diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c index ce15ae7fe76b..99f16128cf49 100644 --- a/kernel/sched/core_ctl.c +++ b/kernel/sched/core_ctl.c @@ -22,6 +22,7 @@ #include <linux/sched/rt.h> #include <trace/events/sched.h> +#include "sched.h" #define MAX_CPUS_PER_CLUSTER 4 #define MAX_CLUSTERS 2 @@ -575,7 +576,8 @@ static bool eval_need(struct cluster_data *cluster) cluster->active_cpus = get_active_cpu_count(cluster); thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0; list_for_each_entry(c, &cluster->lru, sib) { - if (c->busy >= cluster->busy_up_thres[thres_idx]) + if (c->busy >= cluster->busy_up_thres[thres_idx] || + sched_cpu_high_irqload(c->cpu)) c->is_busy = true; else if (c->busy < cluster->busy_down_thres[thres_idx]) c->is_busy = false; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df2e6dd2c665..3f2b5c04623c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3001,6 +3001,8 @@ struct cpu_select_env *env, struct cluster_cpu_stats *stats) int i; struct cpumask search_cpus; + extern int num_clusters; + while (!bitmap_empty(env->backup_list, num_clusters)) { next = next_candidate(env->backup_list, 0, num_clusters); __clear_bit(next->id, env->backup_list); @@ -3024,6 +3026,8 @@ next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env, { struct sched_cluster *next = NULL; + extern int num_clusters; + __clear_bit(cluster->id, env->candidate_list); if (env->rtg && preferred_cluster(cluster, env->p)) @@ -5673,20 +5677,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. 
+ */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; @@ -10536,7 +10548,6 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); @@ -10550,6 +10561,12 @@ more_balance: } /* + * Set loop_max when rq's lock is taken to prevent a race. + */ + env.loop_max = min(sysctl_sched_nr_migrate, + busiest->nr_running); + + /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 5337ac7fcba1..598656b42203 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -3217,6 +3217,13 @@ void sched_get_cpus_busy(struct sched_load *busy, update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), 0); + /* + * Ensure that we don't report load for 'cpu' again via the + * cpufreq_update_util path in the window that started at + * rq->window_start + */ + rq->load_reported_window = rq->window_start; + account_load_subtractions(rq); load[i] = rq->prev_runnable_sum; nload[i] = rq->nt_prev_runnable_sum; @@ -3649,6 +3656,11 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) migrate_top_tasks(p, src_rq, dest_rq); + if (!same_freq_domain(new_cpu, task_cpu(p))) { + cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); + cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); + } + if (p == src_rq->ed_task) { src_rq->ed_task = NULL; if (!dest_rq->ed_task) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 07a3cd3c6fbc..90cc450dff7e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -430,7 +430,6 @@ struct related_thread_group { }; extern struct list_head cluster_head; -extern int num_clusters; extern struct sched_cluster *sched_cluster[NR_CPUS]; struct cpu_cycle { @@ -441,6 +440,7 @@ struct cpu_cycle { #define for_each_sched_cluster(cluster) \ list_for_each_entry_rcu(cluster, &cluster_head, list) +extern unsigned int sched_disable_window_stats; #endif /* CONFIG_SCHED_HMP */ /* CFS-related fields in a runqueue */ @@ -793,6 +793,7 @@ struct rq { int cstate, wakeup_latency, wakeup_energy; u64 window_start; + u64 load_reported_window; unsigned long hmp_flags; u64 cur_irqload; @@ -1853,7 +1854,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; @@ -2852,6 +2853,18 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct 
update_util_data *data; +#ifdef CONFIG_SCHED_HMP + /* + * Skip if we've already reported, but not if this is an inter-cluster + * migration + */ + if (!sched_disable_window_stats && + (rq->load_reported_window == rq->window_start) && + !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG)) + return; + rq->load_reported_window = rq->window_start; +#endif + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); if (data) data->func(data, rq_clock(rq), flags); diff --git a/kernel/signal.c b/kernel/signal.c index 3095b2309876..6aa9ca45ebb1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -79,6 +79,11 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) handler == SIG_DFL && !(force && sig_kernel_only(sig))) return 1; + /* Only allow kernel generated signals to this kthread */ + if (unlikely((t->flags & PF_KTHREAD) && + (handler == SIG_KTHREAD_KERNEL) && !force)) + return true; + return sig_handler_ignored(handler, sig); } @@ -368,27 +373,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi { struct sigqueue *q = NULL; struct user_struct *user; + int sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. + * + * NOTE! A pending signal will hold on to the user refcount, + * and we get/put the refcount only when the sigpending count + * changes from/to zero. */ rcu_read_lock(); - user = get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); + user = __task_cred(t)->user; + sigpending = atomic_inc_return(&user->sigpending); + if (sigpending == 1) + get_uid(user); rcu_read_unlock(); - if (override_rlimit || - atomic_read(&user->sigpending) <= - task_rlimit(t, RLIMIT_SIGPENDING)) { + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - free_uid(user); + if (atomic_dec_and_test(&user->sigpending)) + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; @@ -402,8 +412,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); + if (atomic_dec_and_test(&q->user->sigpending)) + free_uid(q->user); kmem_cache_free(sigqueue_cachep, q); } @@ -1650,7 +1660,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) * This is only possible if parent == real_parent. * Check if it has changed security domain. */ - if (tsk->parent_exec_id != tsk->parent->self_exec_id) + if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id)) sig = SIGCHLD; } diff --git a/kernel/smp.c b/kernel/smp.c index b2ec21c5c9d6..3f300d3fd1f6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -14,6 +14,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/suspend.h> #include "smpboot.h" @@ -766,7 +767,8 @@ void wake_up_all_idle_cpus(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - if (!cpu_isolated(cpu)) + if (suspend_freeze_state == FREEZE_STATE_ENTER || + !cpu_isolated(cpu)) wake_up_if_idle(cpu); } preempt_enable(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b25717eb3d3a..ac0569f7d56a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -98,6 +98,10 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. 
*/ +#ifdef CONFIG_USB +int deny_new_usb __read_mostly = 0; +EXPORT_SYMBOL(deny_new_usb); +#endif extern int suid_dumpable; #ifdef CONFIG_COREDUMP extern int core_uses_pid; @@ -1083,6 +1087,17 @@ static struct ctl_table kern_table[] = { .extra2 = &two, }, #endif +#ifdef CONFIG_USB + { + .procname = "deny_new_usb", + .data = &deny_new_usb, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { .procname = "ngroups_max", .data = &ngroups_max, @@ -1640,7 +1655,7 @@ static struct ctl_table vm_table[] = { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = &one, .extra2 = &four, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 11cc757795cd..8acdd6ae532c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -586,25 +586,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; - struct taskstats *stats; + struct taskstats *stats_new, *stats; - if (sig->stats || thread_group_empty(tsk)) - goto ret; + /* Pairs with smp_store_release() below. */ + stats = smp_load_acquire(&sig->stats); + if (stats || thread_group_empty(tsk)) + return stats; /* No problem if kmem_cache_zalloc() fails */ - stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); + stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); spin_lock_irq(&tsk->sighand->siglock); - if (!sig->stats) { - sig->stats = stats; - stats = NULL; + stats = sig->stats; + if (!stats) { + /* + * Pairs with smp_store_release() above and order the + * kmem_cache_zalloc(). + */ + smp_store_release(&sig->stats, stats_new); + stats = stats_new; + stats_new = NULL; } spin_unlock_irq(&tsk->sighand->siglock); - if (stats) - kmem_cache_free(taskstats_cache, stats); -ret: - return sig->stats; + if (stats_new) + kmem_cache_free(taskstats_cache, stats_new); + + return stats; } /* Send pid data out on exit */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 89cc82a38e4d..3b43159ab41f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -272,8 +272,15 @@ static void clocksource_watchdog(unsigned long data) next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + + /* + * Arm timer if not already pending: could race with concurrent + * pair clocksource_stop_watchdog() clocksource_start_watchdog(). + */ + if (!timer_pending(&watchdog_timer)) { + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); + } out: spin_unlock(&watchdog_lock); } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index e24008c098c6..45a0a26023d4 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -25,8 +25,6 @@ #include <linux/syscalls.h> #include <linux/uaccess.h> -static void delete_clock(struct kref *kref); - /* * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. 
*/ @@ -168,7 +166,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = 0; if (!err) { - kref_get(&clk->kref); + get_device(clk->dev); fp->private_data = clk; } out: @@ -184,7 +182,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp) if (clk->ops.release) err = clk->ops.release(clk); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); fp->private_data = NULL; @@ -206,38 +204,35 @@ static const struct file_operations posix_clock_file_operations = { #endif }; -int posix_clock_register(struct posix_clock *clk, dev_t devid) +int posix_clock_register(struct posix_clock *clk, struct device *dev) { int err; - kref_init(&clk->kref); init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); + err = cdev_device_add(&clk->cdev, dev); + if (err) { + pr_err("%s unable to add device %d:%d\n", + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); + return err; + } clk->cdev.owner = clk->ops.owner; - err = cdev_add(&clk->cdev, devid, 1); + clk->dev = dev; - return err; + return 0; } EXPORT_SYMBOL_GPL(posix_clock_register); -static void delete_clock(struct kref *kref) -{ - struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - - if (clk->release) - clk->release(clk); -} - void posix_clock_unregister(struct posix_clock *clk) { - cdev_del(&clk->cdev); + cdev_device_del(&clk->cdev, clk->dev); down_write(&clk->rwsem); clk->zombie = true; up_write(&clk->rwsem); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); } EXPORT_SYMBOL_GPL(posix_clock_unregister); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index d176c127f744..0a4f0c09bcbf 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1253,7 +1253,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, unsigned long long now = 0; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); - cpu_timer_sample_group(clock_idx, tsk, &now); + if (cpu_timer_sample_group(clock_idx, tsk, &now)) + return; if (oldval) { /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3c7b7a9bcad1..d8b00f94c63d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -136,6 +136,20 @@ config GENERIC_TRACER bool select TRACING +if TRACING + +config DISABLE_TRACE_PRINTK + bool "Force disable trace_printk() usage" + default y + help + When trace_printk() is used in any of the kernel source, it enables + debugging functions which are not desired for production kernel. + Enabling this option will replace trace_printk() with pr_debug(). + + If in doubt, say Y. 
+ +endif + # # Minimum requirements an architecture has to meet for us to # be able to offer generic tracing facilities: diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c39fc68c4778..54be8790941e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -319,11 +319,12 @@ static void put_probe_ref(void) static void blk_trace_cleanup(struct blk_trace *bt) { + synchronize_rcu(); blk_trace_free(bt); put_probe_ref(); } -int blk_trace_remove(struct request_queue *q) +static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; @@ -336,6 +337,17 @@ int blk_trace_remove(struct request_queue *q) return 0; } + +int blk_trace_remove(struct request_queue *q) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_remove(q); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, @@ -546,9 +558,8 @@ err: return ret; } -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) +static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -562,11 +573,24 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { - blk_trace_remove(q); + __blk_trace_remove(q); return -EFAULT; } return 0; } + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_setup(q, name, dev, bdev, arg); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) @@ -595,7 +619,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { - blk_trace_remove(q); + __blk_trace_remove(q); return -EFAULT; } @@ -603,11 +627,13 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, } #endif -int blk_trace_startstop(struct request_queue *q, int start) +static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -642,8 +668,25 @@ int blk_trace_startstop(struct request_queue *q, int start) return ret; } + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_startstop(q, start); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. 
+ */ + /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device @@ -661,12 +704,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -677,17 +720,17 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) case BLKTRACESTART: start = 1; case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); + ret = __blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); + ret = __blk_trace_remove(q); break; default: ret = -ENOTTY; break; } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); return ret; } @@ -698,10 +741,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); + mutex_lock(&q->blk_trace_mutex); + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { + __blk_trace_startstop(q, 0); + __blk_trace_remove(q); } + + mutex_unlock(&q->blk_trace_mutex); } /* @@ -722,11 +769,15 @@ void blk_trace_shutdown(struct request_queue *q) static void blk_add_trace_rq(struct request_queue *q, struct request *rq, unsigned int nr_bytes, u32 what) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct task_struct *tsk = current; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } /* * Use the bio context for all events except ISSUE and @@ -751,6 +802,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, rq->cmd_flags, what, rq->errors, 0, NULL, tsk); } + rcu_read_unlock(); } static void blk_add_trace_rq_abort(void *ignore, @@ -800,11 +852,15 @@ static void blk_add_trace_rq_complete(void *ignore, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct task_struct *tsk = current; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } /* * Not all the pages in the bio are dirtied by the same task but @@ -817,6 +873,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, what, error, 0, NULL, tsk); + rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, @@ -861,11 +918,14 @@ static void blk_add_trace_getrq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL, current); + rcu_read_unlock(); } } @@ -877,28 +937,36 @@ static void blk_add_trace_sleeprq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 
BLK_TA_SLEEPRQ, 0, 0, NULL, current); + rcu_read_unlock(); } } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, current); + rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; @@ -911,15 +979,18 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, current); } + rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct task_struct *tsk = current; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); @@ -932,6 +1003,7 @@ static void blk_add_trace_split(void *ignore, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), &rpdu, tsk); } + rcu_read_unlock(); } /** @@ -951,12 +1023,16 @@ static void blk_add_trace_bio_remap(void *ignore, struct request_queue *q, struct bio *bio, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; struct task_struct *tsk = current; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); @@ -970,6 +1046,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_REMAP, bio->bi_error, sizeof(r), &r, tsk); + rcu_read_unlock(); } /** @@ -990,12 +1067,16 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; struct task_struct *tsk = current; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); @@ -1009,6 +1090,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, sizeof(r), &r, tsk); + rcu_read_unlock(); } /** @@ -1026,11 +1108,15 @@ void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct task_struct *tsk = current; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } if (bio_has_data(rq->bio) && rq->bio->bi_io_vec && rq->bio->bi_io_vec->bv_page && @@ -1043,6 +1129,7 @@ void blk_add_driver_data(struct request_queue *q, else __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, rq->errors, len, data, tsk); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1534,6 +1621,7 @@ static int blk_trace_remove_queue(struct request_queue *q) return -EINVAL; put_probe_ref(); + synchronize_rcu(); blk_trace_free(bt); return 0; } @@ -1694,6 
+1782,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q; struct block_device *bdev; + struct blk_trace *bt; ssize_t ret = -ENXIO; bdev = bdget(part_devt(p)); @@ -1704,26 +1793,28 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); + ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } - if (q->blk_trace == NULL) + if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); + ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: @@ -1737,6 +1828,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev; struct request_queue *q; struct hd_struct *p; + struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1765,10 +1857,12 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - if (!!value == !!q->blk_trace) { + if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } @@ -1780,22 +1874,25 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (q->blk_trace == NULL) + if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); + } if (ret == 0) { if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; + bt->act_mask = value; else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; + bt->pid = value; else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; + bt->start_lba = value; else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; + bt->end_lba = value; } out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3dd40c736067..a71bdad638d5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -206,6 +206,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) event->pmu->count) return -EINVAL; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; + /* * we don't know if the function is run successfully by the * return value. 
It can be judged in other places, such as diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6380ec0453e0..e4c6f89b6b11 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -637,8 +637,7 @@ static int function_stat_show(struct seq_file *m, void *v) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER - avg = rec->time; - do_div(avg, rec->counter); + avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) goto out; #endif @@ -664,7 +663,8 @@ static int function_stat_show(struct seq_file *m, void *v) * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, + rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4509cf51fbfd..b0ebb271bae7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5910,12 +5910,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; + if (ref->ref > INT_MAX/2) + return false; + ref->ref++; + return true; } /* Pipe buffer operations for a buffer. */ diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 8be66a2b0cac..78346aba6980 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -121,9 +121,10 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) { struct trace_event_file *event_file = event_file_data(m->private); - if (t == SHOW_AVAILABLE_TRIGGERS) + if (t == SHOW_AVAILABLE_TRIGGERS) { + (*pos)++; return NULL; - + } return seq_list_next(t, &event_file->triggers, pos); } @@ -909,14 +910,10 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { - int ret = register_trigger(glob, ops, data, file); - - if (ret > 0 && tracing_alloc_snapshot() != 0) { - unregister_trigger(glob, ops, data, file); - ret = 0; - } + if (tracing_alloc_snapshot() != 0) + return 0; - return ret; + return register_trigger(glob, ops, data, file); } static int diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 78f04e4ad829..927fd4ad5846 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -626,7 +626,7 @@ static void start_wakeup_tracer(struct trace_array *tr) if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_migrate_task\n"); - return; + goto fail_deprobe_sched_switch; } wakeup_reset(tr); @@ -644,6 +644,8 @@ static void start_wakeup_tracer(struct trace_array *tr) printk(KERN_ERR "failed to start wakeup tracer\n"); return; +fail_deprobe_sched_switch: + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_wakeup, NULL); fail_deprobe: diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 202df6cffcca..f08ec7c6f9e0 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -197,6 +197,11 @@ check_stack(unsigned long ip, unsigned long *stack) local_irq_restore(flags); } +/* Some archs may not define MCOUNT_INSN_SIZE */ +#ifndef MCOUNT_INSN_SIZE +# define MCOUNT_INSN_SIZE 0 +#endif + static void 
stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6cf935316769..a2081a7f0c2c 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -302,7 +302,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { struct stat_session *session, *node; - int ret; + int ret = -EINVAL; if (!trace) return -EINVAL; @@ -313,17 +313,15 @@ int register_stat_tracer(struct tracer_stat *trace) /* Already registered? */ mutex_lock(&all_stat_sessions_mutex); list_for_each_entry(node, &all_stat_sessions, session_list) { - if (node->ts == trace) { - mutex_unlock(&all_stat_sessions_mutex); - return -EINVAL; - } + if (node->ts == trace) + goto out; } - mutex_unlock(&all_stat_sessions_mutex); + ret = -ENOMEM; /* Init the session */ session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) - return -ENOMEM; + goto out; session->ts = trace; INIT_LIST_HEAD(&session->session_list); @@ -332,15 +330,16 @@ int register_stat_tracer(struct tracer_stat *trace) ret = init_stat_file(session); if (ret) { destroy_session(session); - return ret; + goto out; } + ret = 0; /* Register */ - mutex_lock(&all_stat_sessions_mutex); list_add_tail(&session->session_list, &all_stat_sessions); + out: mutex_unlock(&all_stat_sessions_mutex); - return 0; + return ret; } void unregister_stat_tracer(struct tracer_stat *trace) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 89a0f1171f90..696f0913ba38 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2329,8 +2329,14 @@ repeat: */ if (need_to_create_worker(pool)) { spin_lock(&wq_mayday_lock); - get_pwq(pwq); - list_move_tail(&pwq->mayday_node, &wq->maydays); + /* + * Queue iff we aren't racing destruction + * and somebody else hasn't queued it already. + */ + if (wq->rescuer && list_empty(&pwq->mayday_node)) { + get_pwq(pwq); + list_add_tail(&pwq->mayday_node, &wq->maydays); + } spin_unlock(&wq_mayday_lock); } } @@ -2919,6 +2925,31 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +static bool __cancel_work(struct work_struct *work, bool is_dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(work, is_dwork, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); + local_irq_restore(flags); + return ret; +} + +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -2937,20 +2968,7 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(&dwork->work, true, &flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - - set_work_pool_and_clear_pending(&dwork->work, - get_work_pool_id(&dwork->work)); - local_irq_restore(flags); - return ret; + return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work); @@ -3971,9 +3989,29 @@ void destroy_workqueue(struct workqueue_struct *wq) struct pool_workqueue *pwq; int node; + /* + * Remove it from sysfs first so that sanity check failure doesn't + * lead to sysfs name conflicts. 
+ */ + workqueue_sysfs_unregister(wq); + /* drain it before proceeding with destruction */ drain_workqueue(wq); + /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ + if (wq->rescuer) { + struct worker *rescuer = wq->rescuer; + + /* this prevents new queueing */ + spin_lock_irq(&wq_mayday_lock); + wq->rescuer = NULL; + spin_unlock_irq(&wq_mayday_lock); + + /* rescuer will empty maydays list before exiting */ + kthread_stop(rescuer->task); + kfree(rescuer); + } + /* sanity checks */ mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { @@ -4003,11 +4041,6 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - workqueue_sysfs_unregister(wq); - - if (wq->rescuer) - kthread_stop(wq->rescuer->task); - if (!(wq->flags & WQ_UNBOUND)) { /* * The base ref is never dropped on per-cpu pwqs. Directly @@ -4284,7 +4317,8 @@ static void show_pwq(struct pool_workqueue *pwq) pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + pr_cont(" active=%d/%d refcnt=%d%s\n", + pwq->nr_active, pwq->max_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { |
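/*
 * The destroy_workqueue() hunk above relies on a simple ordering rule:
 * make the rescuer unreachable under wq_mayday_lock first, so nothing new
 * can be queued to it, then stop the rescuer thread, which drains whatever
 * is already on the maydays list before exiting.  Below is a minimal
 * userspace sketch of that ordering, assuming simplified stand-in types
 * and names (struct wq, wq_queue_mayday, wq_destroy, rescuer_fn) rather
 * than the kernel's own structures.
 */
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

struct wq {
	pthread_mutex_t mayday_lock;	/* stands in for wq_mayday_lock */
	pthread_t rescuer_thread;
	bool rescuer_alive;		/* stands in for wq->rescuer != NULL */
	bool stop;
	int pending;			/* stands in for the maydays list */
};

static void *rescuer_fn(void *arg)
{
	struct wq *wq = arg;

	for (;;) {
		pthread_mutex_lock(&wq->mayday_lock);
		if (wq->pending) {
			wq->pending--;		/* "rescuer will empty maydays list" */
			pthread_mutex_unlock(&wq->mayday_lock);
			continue;
		}
		if (wq->stop) {
			pthread_mutex_unlock(&wq->mayday_lock);
			break;			/* exit only once the list is empty */
		}
		pthread_mutex_unlock(&wq->mayday_lock);
		sched_yield();			/* the real rescuer sleeps instead */
	}
	return NULL;
}

/* Queue iff we aren't racing destruction, mirroring the mayday-path check. */
static bool wq_queue_mayday(struct wq *wq)
{
	bool queued = false;

	pthread_mutex_lock(&wq->mayday_lock);
	if (wq->rescuer_alive) {
		wq->pending++;
		queued = true;
	}
	pthread_mutex_unlock(&wq->mayday_lock);
	return queued;
}

static void wq_destroy(struct wq *wq)
{
	/* 1. prevent new queueing, under the same lock the queuers take */
	pthread_mutex_lock(&wq->mayday_lock);
	wq->rescuer_alive = false;
	wq->stop = true;
	pthread_mutex_unlock(&wq->mayday_lock);

	/* 2. the rescuer drains what is left, then exits */
	pthread_join(wq->rescuer_thread, NULL);
}

int main(void)
{
	struct wq wq = { .pending = 0 };

	pthread_mutex_init(&wq.mayday_lock, NULL);
	wq.rescuer_alive = true;
	pthread_create(&wq.rescuer_thread, NULL, rescuer_fn, &wq);

	wq_queue_mayday(&wq);
	wq_destroy(&wq);
	printf("mayday after destroy accepted: %s\n",
	       wq_queue_mayday(&wq) ? "yes" : "no");
	pthread_mutex_destroy(&wq.mayday_lock);
	return 0;
}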
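/*
 * The __sigqueue_alloc()/__sigqueue_free() hunk in kernel/signal.c above
 * changes the uid accounting so that a reference on the user struct is
 * taken only when user->sigpending goes from 0 to 1 and dropped only when
 * it returns to 0.  A minimal C11 sketch of that transition-counted
 * reference pattern; struct user, user_get/user_put and
 * pending_inc/pending_dec are hypothetical stand-ins, not kernel interfaces.
 */
#include <stdatomic.h>
#include <stdio.h>

struct user {
	atomic_int refcount;	/* stands in for the uid refcount */
	atomic_int sigpending;
};

static void user_get(struct user *u) { atomic_fetch_add(&u->refcount, 1); }
static void user_put(struct user *u) { atomic_fetch_sub(&u->refcount, 1); }

/* Take the backing reference only on the 0 -> 1 transition. */
static void pending_inc(struct user *u)
{
	if (atomic_fetch_add(&u->sigpending, 1) == 0)
		user_get(u);
}

/* Drop it only on the 1 -> 0 transition. */
static void pending_dec(struct user *u)
{
	if (atomic_fetch_sub(&u->sigpending, 1) == 1)
		user_put(u);
}

int main(void)
{
	struct user u = { .refcount = 1, .sigpending = 0 };

	pending_inc(&u);	/* first pending signal pins the user */
	pending_inc(&u);	/* later ones take no extra reference */
	pending_dec(&u);
	pending_dec(&u);	/* last one drops the pin */
	printf("refcount=%d sigpending=%d\n",
	       atomic_load(&u.refcount), atomic_load(&u.sigpending));
	return 0;
}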
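/*
 * The kernel/taskstats.c hunk above turns taskstats_tgid_alloc() into a
 * double-checked allocation: lock-free readers use an acquire load of
 * sig->stats, and the pointer is published with a release store so the
 * zeroed allocation is visible before the pointer is.  A minimal C11
 * analogue of that publish pattern; get_stats, shared_stats and
 * struct stats are illustrative names, not the kernel's.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct stats { long counters[16]; };

static _Atomic(struct stats *) shared_stats;
static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;

static struct stats *get_stats(void)
{
	/* Fast path: pairs with the release store below. */
	struct stats *s = atomic_load_explicit(&shared_stats,
					       memory_order_acquire);
	struct stats *s_new;

	if (s)
		return s;

	/* No problem if this fails or if we lose the race below. */
	s_new = calloc(1, sizeof(*s_new));

	pthread_mutex_lock(&stats_lock);
	s = atomic_load_explicit(&shared_stats, memory_order_relaxed);
	if (!s && s_new) {
		/* Publish: orders the calloc() before the pointer is seen. */
		atomic_store_explicit(&shared_stats, s_new,
				      memory_order_release);
		s = s_new;
		s_new = NULL;
	}
	pthread_mutex_unlock(&stats_lock);

	free(s_new);	/* free(NULL) is a no-op */
	return s;
}

int main(void)
{
	return get_stats() == get_stats() ? 0 : 1;
}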