Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                        43
-rw-r--r--  kernel/auditfilter.c                  71
-rw-r--r--  kernel/bpf/syscall.c                   7
-rw-r--r--  kernel/cgroup_pids.c                  11
-rw-r--r--  kernel/cpu.c                           6
-rw-r--r--  kernel/debug/kdb/kdb_main.c            2
-rw-r--r--  kernel/events/core.c                  23
-rw-r--r--  kernel/futex.c                       105
-rw-r--r--  kernel/gcov/fs.c                       2
-rw-r--r--  kernel/irq/chip.c                     48
-rw-r--r--  kernel/irq/handle.c                    3
-rw-r--r--  kernel/irq/irqdesc.c                  16
-rw-r--r--  kernel/irq/manage.c                   11
-rw-r--r--  kernel/irq/migration.c                26
-rw-r--r--  kernel/irq/pm.c                        2
-rw-r--r--  kernel/kmod.c                          4
-rw-r--r--  kernel/locking/lockdep.c               4
-rw-r--r--  kernel/locking/locktorture.c           8
-rw-r--r--  kernel/locking/spinlock_debug.c       32
-rw-r--r--  kernel/module.c                        2
-rw-r--r--  kernel/notifier.c                      2
-rw-r--r--  kernel/padata.c                      137
-rw-r--r--  kernel/power/Kconfig                   4
-rw-r--r--  kernel/power/process.c                 8
-rw-r--r--  kernel/power/suspend.c                 7
-rw-r--r--  kernel/power/wakeup_reason.c         547
-rw-r--r--  kernel/printk/printk.c                 3
-rw-r--r--  kernel/sched/core.c                    6
-rw-r--r--  kernel/sched/core_ctl.c                4
-rw-r--r--  kernel/sched/fair.c                   47
-rw-r--r--  kernel/sched/hmp.c                    12
-rw-r--r--  kernel/sched/sched.h                  17
-rw-r--r--  kernel/signal.c                       30
-rw-r--r--  kernel/smp.c                           4
-rw-r--r--  kernel/sysctl.c                       17
-rw-r--r--  kernel/taskstats.c                    30
-rw-r--r--  kernel/time/clocksource.c             11
-rw-r--r--  kernel/time/posix-clock.c             31
-rw-r--r--  kernel/time/posix-cpu-timers.c         3
-rw-r--r--  kernel/trace/Kconfig                  14
-rw-r--r--  kernel/trace/blktrace.c              191
-rw-r--r--  kernel/trace/bpf_trace.c               4
-rw-r--r--  kernel/trace/ftrace.c                  6
-rw-r--r--  kernel/trace/trace.c                   6
-rw-r--r--  kernel/trace/trace_events_trigger.c   15
-rw-r--r--  kernel/trace/trace_sched_wakeup.c      4
-rw-r--r--  kernel/trace/trace_stack.c             5
-rw-r--r--  kernel/trace/trace_stat.c             19
-rw-r--r--  kernel/workqueue.c                    78
49 files changed, 1205 insertions(+), 483 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index d440c25cb3be..f94484ea1b26 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -753,13 +753,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
audit_log_end(ab);
}
-static int audit_set_feature(struct sk_buff *skb)
+static int audit_set_feature(struct audit_features *uaf)
{
- struct audit_features *uaf;
int i;
BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
- uaf = nlmsg_data(nlmsg_hdr(skb));
/* if there is ever a version 2 we should handle that here */
@@ -815,6 +813,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 seq;
void *data;
+ int data_len;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
@@ -838,6 +837,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
+ data_len = nlmsg_len(nlh);
switch (msg_type) {
case AUDIT_GET: {
@@ -859,7 +859,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_status s;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
if (s.mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(s.enabled);
if (err < 0)
@@ -914,7 +914,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
break;
case AUDIT_SET_FEATURE:
- err = audit_set_feature(skb);
+ if (data_len < sizeof(struct audit_features))
+ return -EINVAL;
+ err = audit_set_feature(data);
if (err)
return err;
break;
@@ -923,9 +925,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
+ /* exit early if there isn't at least one character to print */
+ if (data_len < 2)
+ return -EINVAL;
err = audit_filter_user(msg_type);
if (err == 1) { /* match or error */
+ char *str = data;
+
err = 0;
if (msg_type == AUDIT_USER_TTY) {
err = tty_audit_push_current();
@@ -934,19 +941,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
mutex_unlock(&audit_cmd_mutex);
audit_log_common_recv_msg(&ab, msg_type);
- if (msg_type != AUDIT_USER_TTY)
+ if (msg_type != AUDIT_USER_TTY) {
+ /* ensure NULL termination */
+ str[data_len - 1] = '\0';
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
- (char *)data);
- else {
- int size;
-
+ str);
+ } else {
audit_log_format(ab, " data=");
- size = nlmsg_len(nlh);
- if (size > 0 &&
- ((unsigned char *)data)[size - 1] == '\0')
- size--;
- audit_log_n_untrustedstring(ab, data, size);
+ if (data_len > 0 && str[data_len - 1] == '\0')
+ data_len--;
+ audit_log_n_untrustedstring(ab, str, data_len);
}
audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
@@ -955,7 +960,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+ if (data_len < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
@@ -964,7 +969,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EPERM;
}
err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
- seq, data, nlmsg_len(nlh));
+ seq, data, data_len);
break;
case AUDIT_LIST_RULES:
err = audit_list_rules_send(skb, seq);
@@ -978,7 +983,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_MAKE_EQUIV: {
void *bufp = data;
u32 sizes[2];
- size_t msglen = nlmsg_len(nlh);
+ size_t msglen = data_len;
char *old, *new;
err = -EINVAL;
@@ -1055,7 +1060,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
/* check if new data is valid */
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))
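The audit.c change above snapshots nlmsg_len() once into data_len, rejects undersized payloads, and forces NUL termination before logging user-supplied text. A minimal userspace sketch of that pattern, with hypothetical names (struct status, handle_msg) standing in for the kernel types and plain memcpy standing in for the netlink accessors:

    #include <stdio.h>
    #include <string.h>

    struct status { unsigned mask, enabled; };

    /* Treat 'data' as untrusted: bound every copy by the real payload
     * length and terminate before using it as a string. */
    static int handle_msg(void *data, size_t data_len)
    {
        struct status s;
        char *str = data;

        if (data_len < 2)                 /* at least one char + NUL */
            return -1;

        /* guard against past and future API changes: copy at most
         * sizeof(s), zero the rest */
        memset(&s, 0, sizeof(s));
        memcpy(&s, data, data_len < sizeof(s) ? data_len : sizeof(s));

        str[data_len - 1] = '\0';         /* never trust the sender */
        printf("mask=%u msg='%s'\n", s.mask, str);
        return 0;
    }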
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index cf7aa656b308..41a668a9d561 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -434,6 +434,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
bufp = data->buf;
for (i = 0; i < data->field_count; i++) {
struct audit_field *f = &entry->rule.fields[i];
+ u32 f_val;
err = -EINVAL;
@@ -442,12 +443,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
f->type = data->fields[i];
- f->val = data->values[i];
+ f_val = data->values[i];
/* Support legacy tests for a valid loginuid */
- if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
f->type = AUDIT_LOGINUID_SET;
- f->val = 0;
+ f_val = 0;
entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
}
@@ -463,7 +464,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SUID:
case AUDIT_FSUID:
case AUDIT_OBJ_UID:
- f->uid = make_kuid(current_user_ns(), f->val);
+ f->uid = make_kuid(current_user_ns(), f_val);
if (!uid_valid(f->uid))
goto exit_free;
break;
@@ -472,11 +473,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
- f->gid = make_kgid(current_user_ns(), f->val);
+ f->gid = make_kgid(current_user_ns(), f_val);
if (!gid_valid(f->gid))
goto exit_free;
break;
case AUDIT_ARCH:
+ f->val = f_val;
entry->rule.arch_f = f;
break;
case AUDIT_SUBJ_USER:
@@ -489,11 +491,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
+ }
+ entry->rule.buflen += f_val;
+ f->lsm_str = str;
err = security_audit_rule_init(f->type, f->op, str,
(void **)&f->lsm_rule);
/* Keep currently invalid fields around in case they
@@ -502,68 +506,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
pr_warn("audit rule for LSM \'%s\' is invalid\n",
str);
err = 0;
- }
- if (err) {
- kfree(str);
+ } else if (err)
goto exit_free;
- } else
- f->lsm_str = str;
break;
case AUDIT_WATCH:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
- err = audit_to_watch(&entry->rule, str, f->val, f->op);
+ }
+ err = audit_to_watch(&entry->rule, str, f_val, f->op);
if (err) {
kfree(str);
goto exit_free;
}
+ entry->rule.buflen += f_val;
break;
case AUDIT_DIR:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
+ }
err = audit_make_tree(&entry->rule, str, f->op);
kfree(str);
if (err)
goto exit_free;
+ entry->rule.buflen += f_val;
break;
case AUDIT_INODE:
+ f->val = f_val;
err = audit_to_inode(&entry->rule, f);
if (err)
goto exit_free;
break;
case AUDIT_FILTERKEY:
- if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+ if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
goto exit_free;
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
+ }
+ entry->rule.buflen += f_val;
entry->rule.filterkey = str;
break;
case AUDIT_EXE:
- if (entry->rule.exe || f->val > PATH_MAX)
+ if (entry->rule.exe || f_val > PATH_MAX)
goto exit_free;
- str = audit_unpack_string(&bufp, &remain, f->val);
+ str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
- entry->rule.buflen += f->val;
-
- audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+ audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
if (IS_ERR(audit_mark)) {
kfree(str);
err = PTR_ERR(audit_mark);
goto exit_free;
}
+ entry->rule.buflen += f_val;
entry->rule.exe = audit_mark;
break;
+ default:
+ f->val = f_val;
+ break;
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 04fc1022ad9f..01431ef8cf07 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -152,7 +152,7 @@ static int map_create(union bpf_attr *attr)
err = bpf_map_charge_memlock(map);
if (err)
- goto free_map;
+ goto free_map_nouncharge;
err = bpf_map_new_fd(map);
if (err < 0)
@@ -162,6 +162,8 @@ static int map_create(union bpf_attr *attr)
return err;
free_map:
+ bpf_map_uncharge_memlock(map);
+free_map_nouncharge:
map->ops->map_free(map);
return err;
}
@@ -667,7 +669,7 @@ static int bpf_obj_get(const union bpf_attr *attr)
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
- union bpf_attr attr = {};
+ union bpf_attr attr;
int err;
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
@@ -703,6 +705,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
}
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ memset(&attr, 0, sizeof(attr));
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
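The memset in the bpf(2) hunk is what keeps the partial copy_from_user() safe: a caller built against an older header passes a shorter attribute struct, and every byte beyond it (including union padding) must read as zero. A hedged sketch of that compatibility pattern, using memcpy in place of copy_from_user and an illustrative union layout:

    #include <stdint.h>
    #include <string.h>

    union attr {
        struct { uint32_t map_type, key_size, value_size; } map;
        uint64_t raw[8];
    };

    /* Copy only 'size' caller-supplied bytes; fields the caller does
     * not know about stay zero instead of holding stack garbage. */
    static void read_attr(union attr *attr, const void *uattr, size_t size)
    {
        if (size > sizeof(*attr))
            size = sizeof(*attr);
        memset(attr, 0, sizeof(*attr));
        memcpy(attr, uattr, size);
    }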
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index b50d5a167fda..ea8cb03dbf72 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -48,7 +48,7 @@ struct pids_cgroup {
* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
*/
atomic64_t counter;
- int64_t limit;
+ atomic64_t limit;
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -70,8 +70,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
if (!pids)
return ERR_PTR(-ENOMEM);
- pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0);
+ atomic64_set(&pids->limit, PIDS_MAX);
return &pids->css;
}
@@ -142,13 +142,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
for (p = pids; parent_pids(p); p = parent_pids(p)) {
int64_t new = atomic64_add_return(num, &p->counter);
+ int64_t limit = atomic64_read(&p->limit);
/*
* Since new is capped to the maximum number of pid_t, if
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > p->limit)
+ if (new > limit)
goto revert;
}
@@ -262,7 +263,7 @@ set_limit:
* Limit updates don't need to be mutex'd, since it isn't
* critical that any racing fork()s follow the new limit.
*/
- pids->limit = limit;
+ atomic64_set(&pids->limit, limit);
return nbytes;
}
@@ -270,7 +271,7 @@ static int pids_max_show(struct seq_file *sf, void *v)
{
struct cgroup_subsys_state *css = seq_css(sf);
struct pids_cgroup *pids = css_pids(css);
- int64_t limit = pids->limit;
+ int64_t limit = atomic64_read(&pids->limit);
if (limit >= PIDS_MAX)
seq_printf(sf, "%s\n", PIDS_MAX_STR);
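pids->limit becomes an atomic64_t because pids_max_show() and pids_try_charge() read it locklessly while another task stores a new value, and on 32-bit machines a plain 64-bit load can tear. A minimal C11 sketch of the lockless pair, assuming <stdatomic.h>:

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic int64_t limit;   /* read and written locklessly */

    void set_limit(int64_t v)
    {
        atomic_store_explicit(&limit, v, memory_order_relaxed);
    }

    int64_t get_limit(void)
    {
        /* single untorn load, even mid-update on 32-bit targets */
        return atomic_load_explicit(&limit, memory_order_relaxed);
    }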
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 98791b70277a..223f8f208df1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -648,7 +648,7 @@ int disable_nonboot_cpus(void)
*/
cpumask_clear(frozen_cpus);
- pr_info("Disabling non-boot CPUs ...\n");
+ pr_debug("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
@@ -698,7 +698,7 @@ void enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- pr_info("Enabling non-boot CPUs ...\n");
+ pr_debug("Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
@@ -707,7 +707,7 @@ void enable_nonboot_cpus(void)
error = _cpu_up(cpu, 1);
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
- pr_info("CPU%d is up\n", cpu);
+ pr_debug("CPU%d is up\n", cpu);
cpu_device = get_cpu_device(cpu);
if (!cpu_device)
pr_err("%s: failed to get cpu%d device\n",
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebc52c7bd8a6..cba287a5c976 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2632,7 +2632,7 @@ static int kdb_per_cpu(int argc, const char **argv)
diag = kdbgetularg(argv[3], &whichcpu);
if (diag)
return diag;
- if (!cpu_online(whichcpu)) {
+ if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) {
kdb_printf("cpu %ld is not online\n", whichcpu);
return KDB_BADCPUNUM;
}
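cpu_online() tests a bit in a fixed-size mask, so the user-supplied CPU number must be range-checked first; the kdb fix adds exactly that. The same rule in a self-contained sketch, where NR_IDS and online[] stand in for nr_cpu_ids and cpu_online_mask:

    #include <stdbool.h>

    #define NR_IDS 8
    static bool online[NR_IDS];

    static bool id_is_online(unsigned long id)
    {
        /* validate the index before touching the bitmap */
        return id < NR_IDS && online[id];
    }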
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1d065a82610d..40d738294f0c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5003,7 +5003,15 @@ accounting:
*/
user_lock_limit *= num_online_cpus();
- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+ user_locked = atomic_long_read(&user->locked_vm);
+
+ /*
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
+ */
+ if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += user_extra;
if (user_locked > user_lock_limit)
extra = user_locked - user_lock_limit;
@@ -5992,10 +6000,17 @@ static void perf_event_task_output(struct perf_event *event,
goto out;
task_event->event_id.pid = perf_event_pid(event, task);
- task_event->event_id.ppid = perf_event_pid(event, current);
-
task_event->event_id.tid = perf_event_tid(event, task);
- task_event->event_id.ptid = perf_event_tid(event, current);
+
+ if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
+ task_event->event_id.ppid = perf_event_pid(event,
+ task->real_parent);
+ task_event->event_id.ptid = perf_event_pid(event,
+ task->real_parent);
+ } else { /* PERF_RECORD_FORK */
+ task_event->event_id.ppid = perf_event_pid(event, current);
+ task_event->event_id.ptid = perf_event_tid(event, current);
+ }
task_event->event_id.time = perf_event_clock(event);
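The clamp in the first events/core.c hunk matters when sysctl_perf_event_mlock is lowered after pages were already charged: user->locked_vm may then exceed the new limit, and without the clamp 'extra' would include the whole historical excess rather than just this mapping's request. The arithmetic in isolation, as a hedged sketch whose names mirror the local variables in the hunk, not an exported API:

    static unsigned long charge_extra(unsigned long locked_vm,
                                      unsigned long lock_limit,
                                      unsigned long user_extra)
    {
        unsigned long user_locked = locked_vm;

        if (user_locked > lock_limit)   /* limit shrank since earlier charges */
            user_locked = lock_limit;
        user_locked += user_extra;

        return user_locked > lock_limit ? user_locked - lock_limit : 0;
    }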
diff --git a/kernel/futex.c b/kernel/futex.c
index d24d164e9bdb..e3ef6934b37f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -378,9 +378,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
- u32 hash = jhash2((u32*)&key->both.word,
- (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
+
return &futex_queues[hash & (futex_hashsize - 1)];
}
@@ -407,7 +407,7 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode); /* implies MB (B) */
+ smp_mb(); /* explicit smp_mb(); (B) */
break;
case FUT_OFF_MMSHARED:
futex_get_mm(key); /* implies MB (B) */
@@ -438,7 +438,6 @@ static void drop_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- iput(key->shared.inode);
break;
case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
@@ -446,6 +445,46 @@ static void drop_futex_key_refs(union futex_key *key)
}
}
+/*
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that match_futex() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ */
+static u64 get_inode_sequence_number(struct inode *inode)
+{
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence);
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_add_return(1, &i_seq);
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
+ if (old)
+ return old;
+ return new;
+ }
+}
+
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
@@ -458,9 +497,15 @@ static void drop_futex_key_refs(union futex_key *key)
*
* The key words are stored in *key on success.
*
- * For shared mappings, it's (page->index, file_inode(vma->vm_file),
- * offset_within_page). For private mappings, it's (uaddr, current->mm).
- * We can usually work out the index without swapping in the page.
+ * For shared mappings (when @fshared), the key is:
+ * ( inode->i_sequence, page->index, offset_within_page )
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ * ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
@@ -628,8 +673,6 @@ again:
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key); /* implies smp_mb(); (B) */
-
} else {
struct inode *inode;
@@ -661,40 +704,14 @@ again:
goto again;
}
- /*
- * Take a reference unless it is about to be freed. Previously
- * this reference was taken by ihold under the page lock
- * pinning the inode in place so i_lock was unnecessary. The
- * only way for this check to fail is if the inode was
- * truncated in parallel which is almost certainly an
- * application bug. In such a case, just retry.
- *
- * We are not calling into get_futex_key_refs() in file-backed
- * cases, therefore a successful atomic_inc return below will
- * guarantee that get_futex_key() will still imply smp_mb(); (B).
- */
- if (!atomic_inc_not_zero(&inode->i_count)) {
- rcu_read_unlock();
- put_page(page_head);
-
- goto again;
- }
-
- /* Should be impossible but lets be paranoid for now */
- if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
- err = -EFAULT;
- rcu_read_unlock();
- iput(inode);
-
- goto out;
- }
-
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = inode;
+ key->shared.i_seq = get_inode_sequence_number(inode);
key->shared.pgoff = basepage_index(page);
rcu_read_unlock();
}
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
out:
put_page(page_head);
return err;
@@ -1462,8 +1479,16 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
int oldval, ret;
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
- if (oparg < 0 || oparg > 31)
- return -EINVAL;
+ if (oparg < 0 || oparg > 31) {
+ char comm[sizeof(current->comm)];
+ /*
+ * kill this print and return -EINVAL when userspace
+ * is sane again
+ */
+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
+ get_task_comm(comm, current), oparg);
+ oparg &= 31;
+ }
oparg = 1 << oparg;
}
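The new shared-futex key is built around get_inode_sequence_number(): a never-zero 64-bit ID assigned lazily on first use, with cmpxchg arbitrating concurrent first users so every waiter sees the same value. A userspace C11 rendition of the same assign-once scheme:

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint64_t i_seq;   /* global generator; 0 is never issued */

    uint64_t get_sequence_number(_Atomic uint64_t *slot)
    {
        uint64_t old = atomic_load(slot);
        if (old)                          /* fast path: already assigned */
            return old;

        for (;;) {
            uint64_t new = atomic_fetch_add(&i_seq, 1) + 1;
            uint64_t expected = 0;

            if (new == 0)                 /* u64 wrap: ~585 years at 1/ns */
                continue;
            if (atomic_compare_exchange_strong(slot, &expected, new))
                return new;               /* we published the ID */
            return expected;              /* lost the race; adopt the winner's */
        }
    }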
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index edf67c493a8e..e473f6a1f6ca 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -108,9 +108,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
{
struct gcov_iterator *iter = data;
+ (*pos)++;
if (gcov_iter_next(iter))
return NULL;
- (*pos)++;
return iter;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 5a2932713d0c..f4c99c41fee2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -33,6 +33,7 @@ static irqreturn_t bad_chained_irq(int irq, void *dev_id)
*/
struct irqaction chained_action = {
.handler = bad_chained_irq,
+ .name = "chained-irq",
};
/**
@@ -327,11 +328,12 @@ void unmask_threaded_irq(struct irq_desc *desc)
* handler. The handler function is called inside the calling
* threads context.
*/
-void handle_nested_irq(unsigned int irq)
+bool handle_nested_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
irqreturn_t action_ret;
+ bool handled = false;
might_sleep();
@@ -356,8 +358,11 @@ void handle_nested_irq(unsigned int irq)
raw_spin_lock_irq(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ handled = true;
+
out_unlock:
raw_spin_unlock_irq(&desc->lock);
+ return handled;
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
@@ -404,8 +409,10 @@ static bool irq_may_run(struct irq_desc *desc)
* Note: The caller is expected to handle the ack, clear, mask and
* unmask issues if necessary.
*/
-void handle_simple_irq(struct irq_desc *desc)
+bool handle_simple_irq(struct irq_desc *desc)
{
+ bool handled = false;
+
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
@@ -421,8 +428,11 @@ void handle_simple_irq(struct irq_desc *desc)
kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
+ handled = true;
+
out_unlock:
raw_spin_unlock(&desc->lock);
+ return handled;
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
@@ -453,8 +463,10 @@ static void cond_unmask_irq(struct irq_desc *desc)
* it after the associated handler has acknowledged the device, so the
* interrupt line is back to inactive.
*/
-void handle_level_irq(struct irq_desc *desc)
+bool handle_level_irq(struct irq_desc *desc)
{
+ bool handled = false;
+
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
@@ -477,8 +489,11 @@ void handle_level_irq(struct irq_desc *desc)
cond_unmask_irq(desc);
+ handled = true;
+
out_unlock:
raw_spin_unlock(&desc->lock);
+ return handled;
}
EXPORT_SYMBOL_GPL(handle_level_irq);
@@ -522,9 +537,10 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
* for modern forms of interrupt handlers, which handle the flow
* details in hardware, transparently.
*/
-void handle_fasteoi_irq(struct irq_desc *desc)
+bool handle_fasteoi_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
+ bool handled = false;
raw_spin_lock(&desc->lock);
@@ -552,12 +568,15 @@ void handle_fasteoi_irq(struct irq_desc *desc)
cond_unmask_eoi_irq(desc, chip);
+ handled = true;
+
raw_spin_unlock(&desc->lock);
- return;
+ return handled;
out:
if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
chip->irq_eoi(&desc->irq_data);
raw_spin_unlock(&desc->lock);
+ return handled;
}
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
@@ -576,8 +595,10 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
* the handler was running. If all pending interrupts are handled, the
* loop is left.
*/
-void handle_edge_irq(struct irq_desc *desc)
+bool handle_edge_irq(struct irq_desc *desc)
{
+ bool handled = false;
+
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
@@ -621,12 +642,14 @@ void handle_edge_irq(struct irq_desc *desc)
}
handle_irq_event(desc);
+ handled = true;
} while ((desc->istate & IRQS_PENDING) &&
!irqd_irq_disabled(&desc->irq_data));
out_unlock:
raw_spin_unlock(&desc->lock);
+ return handled;
}
EXPORT_SYMBOL(handle_edge_irq);
@@ -638,8 +661,9 @@ EXPORT_SYMBOL(handle_edge_irq);
* Similar as the above handle_edge_irq, but using eoi and w/o the
* mask/unmask logic.
*/
-void handle_edge_eoi_irq(struct irq_desc *desc)
+bool handle_edge_eoi_irq(struct irq_desc *desc)
{
+ bool handled = false;
struct irq_chip *chip = irq_desc_get_chip(desc);
raw_spin_lock(&desc->lock);
@@ -667,6 +691,7 @@ void handle_edge_eoi_irq(struct irq_desc *desc)
goto out_eoi;
handle_irq_event(desc);
+ handled = true;
} while ((desc->istate & IRQS_PENDING) &&
!irqd_irq_disabled(&desc->irq_data));
@@ -674,6 +699,7 @@ void handle_edge_eoi_irq(struct irq_desc *desc)
out_eoi:
chip->irq_eoi(&desc->irq_data);
raw_spin_unlock(&desc->lock);
+ return handled;
}
#endif
@@ -683,7 +709,7 @@ out_eoi:
*
* Per CPU interrupts on SMP machines without locking requirements
*/
-void handle_percpu_irq(struct irq_desc *desc)
+bool handle_percpu_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
@@ -696,6 +722,8 @@ void handle_percpu_irq(struct irq_desc *desc)
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
+
+ return true;
}
/**
@@ -709,7 +737,7 @@ void handle_percpu_irq(struct irq_desc *desc)
* contain the real device id for the cpu on which this handler is
* called
*/
-void handle_percpu_devid_irq(struct irq_desc *desc)
+bool handle_percpu_devid_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
struct irqaction *action = desc->action;
@@ -728,6 +756,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
+
+ return true;
}
void
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 57bff7857e87..80e76ddbf804 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -26,13 +26,14 @@
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
-void handle_bad_irq(struct irq_desc *desc)
+bool handle_bad_irq(struct irq_desc *desc)
{
unsigned int irq = irq_desc_get_irq(desc);
print_irq_desc(irq, desc);
kstat_incr_irqs_this_cpu(desc);
ack_bad_irq(irq);
+ return true;
}
EXPORT_SYMBOL_GPL(handle_bad_irq);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 239e2ae2c947..52fbf88cd2d8 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -15,6 +15,7 @@
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
#include <linux/irqdomain.h>
+#include <linux/wakeup_reason.h>
#include "internals.h"
@@ -339,16 +340,25 @@ void irq_init_desc(unsigned int irq)
/**
* generic_handle_irq - Invoke the handler for a particular irq
* @irq: The irq number to handle
- *
+ * returns:
+ * negative on error
+ * 0 when the interrupt handler was not called
+ * 1 when the interrupt handler was called
*/
+
int generic_handle_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (!desc)
return -EINVAL;
- generic_handle_irq_desc(desc);
- return 0;
+
+ if (unlikely(logging_wakeup_reasons_nosync()))
+ return log_possible_wakeup_reason(irq,
+ desc,
+ generic_handle_irq_desc);
+
+ return generic_handle_irq_desc(desc);
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d0193c0b2531..4746500d65ec 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -235,7 +235,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (desc->affinity_notify) {
kref_get(&desc->affinity_notify->kref);
- schedule_work(&desc->affinity_notify->work);
+ if (!schedule_work(&desc->affinity_notify->work)) {
+ /* Work was already scheduled, drop our extra ref */
+ kref_put(&desc->affinity_notify->kref,
+ desc->affinity_notify->release);
+ }
}
irqd_set(data, IRQD_AFFINITY_SET);
@@ -335,7 +339,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (old_notify) {
- cancel_work_sync(&old_notify->work);
+ if (cancel_work_sync(&old_notify->work)) {
+ /* Pending work had a ref, put that one too */
+ kref_put(&old_notify->kref, old_notify->release);
+ }
kref_put(&old_notify->kref, old_notify->release);
}
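Both irq/manage.c hunks close the same leak: a kref is taken per queued work instance, so a failed schedule_work() (work already pending) must give the extra reference back, and a successful cancel_work_sync() must drop the reference the killed instance was carrying. The discipline in a self-contained sketch, with queue_once()/cancel_pending() standing in for schedule_work()/cancel_work_sync():

    #include <stdbool.h>

    struct notify { int refs; bool queued; };

    static bool queue_once(struct notify *n)      /* like schedule_work() */
    {
        if (n->queued)
            return false;                         /* already pending */
        n->queued = true;
        return true;
    }

    static bool cancel_pending(struct notify *n)  /* like cancel_work_sync() */
    {
        bool was_queued = n->queued;
        n->queued = false;
        return was_queued;
    }

    void kick(struct notify *n)
    {
        n->refs++;                     /* the ref rides with the work item */
        if (!queue_once(n))
            n->refs--;                 /* no new instance; return the ref */
    }

    void teardown(struct notify *n)
    {
        if (cancel_pending(n))
            n->refs--;                 /* the cancelled instance's ref */
        n->refs--;                     /* drop our own reference */
    }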
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 37ddb7bda651..ec7c7eda0774 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,17 +7,18 @@
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
- struct irq_chip *chip = desc->irq_data.chip;
+ struct irq_data *data = &desc->irq_data;
+ struct irq_chip *chip = data->chip;
- if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
+ if (likely(!irqd_is_setaffinity_pending(data)))
return;
- irqd_clr_move_pending(&desc->irq_data);
+ irqd_clr_move_pending(data);
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
- if (irqd_is_per_cpu(&desc->irq_data)) {
+ if (irqd_is_per_cpu(data)) {
WARN_ON(1);
return;
}
@@ -42,9 +43,20 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
- irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
-
+ if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) {
+ int ret;
+
+ ret = irq_do_set_affinity(data, desc->pending_mask, false);
+ /*
+	 * If there is a cleanup pending in the underlying
+ * vector management, reschedule the move for the next
+ * interrupt. Leave desc->pending_mask intact.
+ */
+ if (ret == -EBUSY) {
+ irqd_set_move_pending(data);
+ return;
+ }
+ }
cpumask_clear(desc->pending_mask);
}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cea1de0161f1..28e134310435 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -11,7 +11,7 @@
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
-
+#include <linux/wakeup_reason.h>
#include "internals.h"
bool irq_pm_check_wakeup(struct irq_desc *desc)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0277d1216f80..e4e5e98002fe 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -119,7 +119,7 @@ out:
* invoke it.
*
* If module auto-loading support is disabled then this function
- * becomes a no-operation.
+ * simply returns -ENOENT.
*/
int __request_module(bool wait, const char *fmt, ...)
{
@@ -140,7 +140,7 @@ int __request_module(bool wait, const char *fmt, ...)
WARN_ON_ONCE(wait && current_is_async());
if (!modprobe_path[0])
- return 0;
+ return -ENOENT;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a419696709a1..0a00720d3ccc 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1265,9 +1265,11 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
this.class = class;
raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
arch_spin_lock(&lockdep_lock);
ret = __lockdep_count_forward_deps(&this);
arch_spin_unlock(&lockdep_lock);
+ current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
@@ -1292,9 +1294,11 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
this.class = class;
raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
arch_spin_lock(&lockdep_lock);
ret = __lockdep_count_backward_deps(&this);
arch_spin_unlock(&lockdep_lock);
+ current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index d580b7d6ee6d..ad5aea269f76 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -655,10 +655,10 @@ static void __torture_print_stats(char *page,
if (statp[i].n_lock_fail)
fail = true;
sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_fail)
- max = statp[i].n_lock_fail;
- if (min > statp[i].n_lock_fail)
- min = statp[i].n_lock_fail;
+ if (max < statp[i].n_lock_acquired)
+ max = statp[i].n_lock_acquired;
+ if (min > statp[i].n_lock_acquired)
+ min = statp[i].n_lock_acquired;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index d381f559e0ce..989991f00dc7 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -53,19 +53,19 @@ EXPORT_SYMBOL(__rwlock_init);
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
- struct task_struct *owner = NULL;
+ struct task_struct *owner = READ_ONCE(lock->owner);
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
- lock, lock->magic,
+ lock, READ_ONCE(lock->magic),
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
+ READ_ONCE(lock->owner_cpu));
#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG
msm_trigger_wdog_bite();
#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG)
@@ -87,16 +87,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
static inline void
debug_spin_lock_before(raw_spinlock_t *lock)
{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_spin_lock_after(raw_spinlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_spin_unlock(raw_spinlock_t *lock)
@@ -106,8 +106,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
static void __spin_lock_debug(raw_spinlock_t *lock)
@@ -245,8 +245,8 @@ static inline void debug_write_lock_before(rwlock_t *lock)
static inline void debug_write_lock_after(rwlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_write_unlock(rwlock_t *lock)
@@ -255,8 +255,8 @@ static inline void debug_write_unlock(rwlock_t *lock)
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
#if 0 /* This can cause lockups */
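The conversions above exist because spin_dump() and the *_before() checks read owner, owner_cpu, and magic without holding the lock; READ_ONCE/WRITE_ONCE make each access a single, non-refetched load or store so concurrent updates cannot be torn or duplicated by the compiler. A simplified userspace rendition of the accessors (the kernel's real macros add size and type checks):

    #define READ_ONCE(x)      (*(const volatile __typeof__(x) *)&(x))
    #define WRITE_ONCE(x, v)  (*(volatile __typeof__(x) *)&(x) = (v))

    struct dbg_lock { void *owner; int owner_cpu; };

    void lock_acquired(struct dbg_lock *l, void *task, int cpu)
    {
        WRITE_ONCE(l->owner, task);
        WRITE_ONCE(l->owner_cpu, cpu);
    }

    int report_owner_cpu(struct dbg_lock *l)
    {
        return READ_ONCE(l->owner_cpu);   /* one load, even while racing */
    }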
diff --git a/kernel/module.c b/kernel/module.c
index ad4928210e28..3b7aac2ea1e5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1014,6 +1014,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
free_module(mod);
+ /* someone could wait for the module in add_unformed_module() */
+ wake_up_all(&module_wq);
return 0;
out:
mutex_unlock(&module_mutex);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index fd2c9acbcc19..0f70f1b6fdaa 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -552,7 +552,7 @@ NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
- vmalloc_sync_all();
+ vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
diff --git a/kernel/padata.c b/kernel/padata.c
index 282b489a286d..c50975f43b34 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -33,6 +33,8 @@
#define MAX_OBJ_NUM 1000
+static void padata_free_pd(struct parallel_data *pd);
+
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
@@ -63,15 +65,11 @@ static int padata_cpu_hash(struct parallel_data *pd)
static void padata_parallel_worker(struct work_struct *parallel_work)
{
struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
- struct padata_instance *pinst;
LIST_HEAD(local_list);
local_bh_disable();
pqueue = container_of(parallel_work,
struct padata_parallel_queue, work);
- pd = pqueue->pd;
- pinst = pd->pinst;
spin_lock(&pqueue->parallel.lock);
list_replace_init(&pqueue->parallel.list, &local_list);
@@ -134,6 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst,
padata->cb_cpu = cb_cpu;
target_cpu = padata_cpu_hash(pd);
+ padata->cpu = target_cpu;
queue = per_cpu_ptr(pd->pqueue, target_cpu);
spin_lock(&queue->parallel.lock);
@@ -157,8 +156,6 @@ EXPORT_SYMBOL(padata_do_parallel);
* A pointer to the control struct of the next object that needs
* serialization, if present in one of the percpu reorder queues.
*
- * NULL, if all percpu reorder queues are empty.
- *
* -EINPROGRESS, if the next object that needs serialization will
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
@@ -168,25 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel);
*/
static struct padata_priv *padata_get_next(struct parallel_data *pd)
{
- int cpu, num_cpus;
- unsigned int next_nr, next_index;
struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
+ int cpu = pd->cpu;
- num_cpus = cpumask_weight(pd->cpumask.pcpu);
-
- /*
- * Calculate the percpu reorder queue and the sequence
- * number of the next object.
- */
- next_nr = pd->processed;
- next_index = next_nr % num_cpus;
- cpu = padata_index_to_cpu(pd, next_index);
next_queue = per_cpu_ptr(pd->pqueue, cpu);
-
- padata = NULL;
-
reorder = &next_queue->reorder;
spin_lock(&reorder->lock);
@@ -197,7 +181,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
- pd->processed++;
+ pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1,
+ false);
spin_unlock(&reorder->lock);
goto out;
@@ -220,6 +205,7 @@ static void padata_reorder(struct parallel_data *pd)
struct padata_priv *padata;
struct padata_serial_queue *squeue;
struct padata_instance *pinst = pd->pinst;
+ struct padata_parallel_queue *next_queue;
/*
* We need to ensure that only one cpu can work on dequeueing of
@@ -238,12 +224,11 @@ static void padata_reorder(struct parallel_data *pd)
padata = padata_get_next(pd);
/*
- * All reorder queues are empty, or the next object that needs
- * serialization is parallel processed by another cpu and is
- * still on it's way to the cpu's reorder queue, nothing to
- * do for now.
+ * If the next object that needs serialization is parallel
+	 * processed by another cpu and is still on its way to the
+ * cpu's reorder queue, nothing to do for now.
*/
- if (!padata || PTR_ERR(padata) == -EINPROGRESS)
+ if (PTR_ERR(padata) == -EINPROGRESS)
break;
/*
@@ -252,7 +237,6 @@ static void padata_reorder(struct parallel_data *pd)
* so exit immediately.
*/
if (PTR_ERR(padata) == -ENODATA) {
- del_timer(&pd->timer);
spin_unlock_bh(&pd->lock);
return;
}
@@ -271,28 +255,27 @@ static void padata_reorder(struct parallel_data *pd)
/*
* The next object that needs serialization might have arrived to
- * the reorder queues in the meantime, we will be called again
- * from the timer function if no one else cares for it.
+ * the reorder queues in the meantime.
*
- * Ensure reorder_objects is read after pd->lock is dropped so we see
- * an increment from another task in padata_do_serial. Pairs with
+ * Ensure reorder queue is read after pd->lock is dropped so we see
+ * new objects from another task in padata_do_serial. Pairs with
* smp_mb__after_atomic in padata_do_serial.
*/
smp_mb();
- if (atomic_read(&pd->reorder_objects)
- && !(pinst->flags & PADATA_RESET))
- mod_timer(&pd->timer, jiffies + HZ);
- else
- del_timer(&pd->timer);
- return;
+ next_queue = per_cpu_ptr(pd->pqueue, pd->cpu);
+ if (!list_empty(&next_queue->reorder.list))
+ queue_work(pinst->wq, &pd->reorder_work);
}
-static void padata_reorder_timer(unsigned long arg)
+static void invoke_padata_reorder(struct work_struct *work)
{
- struct parallel_data *pd = (struct parallel_data *)arg;
+ struct parallel_data *pd;
+ local_bh_disable();
+ pd = container_of(work, struct parallel_data, reorder_work);
padata_reorder(pd);
+ local_bh_enable();
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -300,6 +283,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
+ int cnt;
local_bh_disable();
squeue = container_of(serial_work, struct padata_serial_queue, work);
@@ -309,6 +293,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_replace_init(&squeue->serial.list, &local_list);
spin_unlock(&squeue->serial.lock);
+ cnt = 0;
+
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -318,9 +304,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_del_init(&padata->list);
padata->serial(padata);
- atomic_dec(&pd->refcnt);
+ cnt++;
}
local_bh_enable();
+
+ if (atomic_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
}
/**
@@ -333,29 +322,22 @@ static void padata_serial_worker(struct work_struct *serial_work)
*/
void padata_do_serial(struct padata_priv *padata)
{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
-
- pd = padata->pd;
-
- cpu = get_cpu();
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ struct parallel_data *pd = padata->pd;
+ struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
+ padata->cpu);
spin_lock(&pqueue->reorder.lock);
- atomic_inc(&pd->reorder_objects);
list_add_tail(&padata->list, &pqueue->reorder.list);
+ atomic_inc(&pd->reorder_objects);
spin_unlock(&pqueue->reorder.lock);
/*
- * Ensure the atomic_inc of reorder_objects above is ordered correctly
+ * Ensure the addition to the reorder list is ordered correctly
* with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
* in padata_reorder.
*/
smp_mb__after_atomic();
- put_cpu();
-
padata_reorder(pd);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -404,9 +386,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
struct padata_parallel_queue *pqueue;
cpu_index = 0;
- for_each_cpu(cpu, pd->cpumask.pcpu) {
+ for_each_possible_cpu(cpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
- pqueue->pd = pd;
+
+ if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+ pqueue->cpu_index = -1;
+ continue;
+ }
+
pqueue->cpu_index = cpu_index;
cpu_index++;
@@ -440,12 +427,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
padata_init_pqueues(pd);
padata_init_squeues(pd);
- setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
- atomic_set(&pd->refcnt, 0);
+ atomic_set(&pd->refcnt, 1);
pd->pinst = pinst;
spin_lock_init(&pd->lock);
+ pd->cpu = cpumask_first(pd->cpumask.pcpu);
+ INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
@@ -468,31 +456,6 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}
-/* Flush all objects out of the padata queues. */
-static void padata_flush_queues(struct parallel_data *pd)
-{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct padata_serial_queue *squeue;
-
- for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
- flush_work(&pqueue->work);
- }
-
- del_timer_sync(&pd->timer);
-
- if (atomic_read(&pd->reorder_objects))
- padata_reorder(pd);
-
- for_each_cpu(cpu, pd->cpumask.cbcpu) {
- squeue = per_cpu_ptr(pd->squeue, cpu);
- flush_work(&squeue->work);
- }
-
- BUG_ON(atomic_read(&pd->refcnt) != 0);
-}
-
static void __padata_start(struct padata_instance *pinst)
{
pinst->flags |= PADATA_INIT;
@@ -506,10 +469,6 @@ static void __padata_stop(struct padata_instance *pinst)
pinst->flags &= ~PADATA_INIT;
synchronize_rcu();
-
- get_online_cpus();
- padata_flush_queues(pinst->pd);
- put_online_cpus();
}
/* Replace the internal control structure with a new one. */
@@ -530,8 +489,8 @@ static void padata_replace(struct padata_instance *pinst,
if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
notification_mask |= PADATA_CPU_SERIAL;
- padata_flush_queues(pd_old);
- padata_free_pd(pd_old);
+ if (atomic_dec_and_test(&pd_old->refcnt))
+ padata_free_pd(pd_old);
if (notification_mask)
blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
@@ -661,8 +620,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- mutex_lock(&pinst->lock);
get_online_cpus();
+ mutex_lock(&pinst->lock);
switch (cpumask_type) {
case PADATA_CPU_PARALLEL:
@@ -680,8 +639,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
out:
- put_online_cpus();
mutex_unlock(&pinst->lock);
+ put_online_cpus();
return err;
}
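The padata reorder rewrite drops the 'processed' counter and the 1 Hz retry timer: pd->cpu now names the CPU whose reorder queue owes the next object, advanced round-robin with cpumask_next_wrap(), and reorder_work is requeued only when that queue is non-empty. The wrap-around walk, sketched approximately over a plain 32-bit mask instead of a struct cpumask:

    #include <stdint.h>

    /* ~cpumask_next_wrap(): next set bit after 'cur', wrapping once */
    static int next_wrap(uint32_t mask, int cur)
    {
        for (int i = 1; i <= 32; i++) {
            int cpu = (cur + i) % 32;
            if (mask & (1u << cpu))
                return cpu;
        }
        return -1;   /* empty mask */
    }

After each object is dequeued, pd->cpu advances via this walk, so serialization consumes the per-CPU queues in the same round-robin order in which padata_do_parallel() filled them.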
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 4335e7d1c391..500ba8b970e4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -322,3 +322,7 @@ config PM_GENERIC_DOMAINS_OF
config CPU_PM
bool
+
+config DEDUCE_WAKEUP_REASONS
+ bool
+ default n
diff --git a/kernel/power/process.c b/kernel/power/process.c
index cc177142a08f..372de061dda2 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -37,9 +37,6 @@ static int try_to_freeze_tasks(bool user_only)
unsigned int elapsed_msecs;
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
-#ifdef CONFIG_PM_SLEEP
- char suspend_abort[MAX_SUSPEND_ABORT_LEN];
-#endif
do_gettimeofday(&start);
@@ -69,11 +66,6 @@ static int try_to_freeze_tasks(bool user_only)
break;
if (pm_wakeup_pending()) {
-#ifdef CONFIG_PM_SLEEP
- pm_get_active_wakeup_sources(suspend_abort,
- MAX_SUSPEND_ABORT_LEN);
- log_suspend_abort_reason(suspend_abort);
-#endif
wakeup = true;
break;
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 58209d8bfc56..6e7832ee6d74 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -287,6 +287,7 @@ static int suspend_prepare(suspend_state_t state)
if (!error)
return 0;
+ log_suspend_abort_reason("One or more tasks refusing to freeze");
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
Finish:
@@ -316,7 +317,6 @@ void __weak arch_suspend_enable_irqs(void)
*/
static int suspend_enter(suspend_state_t state, bool *wakeup)
{
- char suspend_abort[MAX_SUSPEND_ABORT_LEN];
int error, last_dev;
error = platform_suspend_prepare(state);
@@ -385,11 +385,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
state, false);
events_check_enabled = false;
} else if (*wakeup) {
- pm_get_active_wakeup_sources(suspend_abort,
- MAX_SUSPEND_ABORT_LEN);
- log_suspend_abort_reason(suspend_abort);
error = -EBUSY;
}
+
+ start_logging_wakeup_reasons();
syscore_resume();
}
diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c
index 252611fad2fe..44d8da2952c5 100644
--- a/kernel/power/wakeup_reason.c
+++ b/kernel/power/wakeup_reason.c
@@ -26,42 +26,232 @@
#include <linux/spinlock.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
-
+#include <linux/slab.h>
#define MAX_WAKEUP_REASON_IRQS 32
-static int irq_list[MAX_WAKEUP_REASON_IRQS];
-static int irqcount;
static bool suspend_abort;
static char abort_reason[MAX_SUSPEND_ABORT_LEN];
+
+static struct wakeup_irq_node *base_irq_nodes;
+static struct wakeup_irq_node *cur_irq_tree;
+static int cur_irq_tree_depth;
+static LIST_HEAD(wakeup_irqs);
+
+static struct kmem_cache *wakeup_irq_nodes_cache;
static struct kobject *wakeup_reason;
-static DEFINE_SPINLOCK(resume_reason_lock);
+static spinlock_t resume_reason_lock;
+bool log_wakeups __read_mostly;
+struct completion wakeups_completion;
static ktime_t last_monotime; /* monotonic time before last suspend */
static ktime_t curr_monotime; /* monotonic time after last suspend */
static ktime_t last_stime; /* monotonic boottime offset before last suspend */
static ktime_t curr_stime; /* monotonic boottime offset after last suspend */
-static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
+static void init_wakeup_irq_node(struct wakeup_irq_node *p, int irq)
{
- int irq_no, buf_offset = 0;
- struct irq_desc *desc;
- spin_lock(&resume_reason_lock);
- if (suspend_abort) {
- buf_offset = sprintf(buf, "Abort: %s", abort_reason);
- } else {
- for (irq_no = 0; irq_no < irqcount; irq_no++) {
- desc = irq_to_desc(irq_list[irq_no]);
- if (desc && desc->action && desc->action->name)
- buf_offset += sprintf(buf + buf_offset, "%d %s\n",
- irq_list[irq_no], desc->action->name);
- else
- buf_offset += sprintf(buf + buf_offset, "%d\n",
- irq_list[irq_no]);
+ p->irq = irq;
+ p->desc = irq_to_desc(irq);
+ p->child = NULL;
+ p->parent = NULL;
+ p->handled = false;
+ INIT_LIST_HEAD(&p->siblings);
+ INIT_LIST_HEAD(&p->next);
+}
+
+static struct wakeup_irq_node* alloc_irq_node(int irq)
+{
+ struct wakeup_irq_node *n;
+
+ n = kmem_cache_alloc(wakeup_irq_nodes_cache, GFP_ATOMIC);
+ if (!n) {
+ pr_warning("Failed to log chained wakeup IRQ %d\n",
+ irq);
+ return NULL;
+ }
+
+ init_wakeup_irq_node(n, irq);
+ return n;
+}
+
+static struct wakeup_irq_node *
+search_siblings(struct wakeup_irq_node *root, int irq)
+{
+ bool found = false;
+ struct wakeup_irq_node *n = NULL;
+ BUG_ON(!root);
+
+ if (root->irq == irq)
+ return root;
+
+ list_for_each_entry(n, &root->siblings, siblings) {
+ if (n->irq == irq) {
+ found = true;
+ break;
}
}
- spin_unlock(&resume_reason_lock);
- return buf_offset;
+
+ return found ? n : NULL;
+}
+
+static struct wakeup_irq_node *
+add_to_siblings(struct wakeup_irq_node *root, int irq)
+{
+ struct wakeup_irq_node *n;
+ if (root) {
+ n = search_siblings(root, irq);
+ if (n)
+ return n;
+ }
+ n = alloc_irq_node(irq);
+
+ if (n && root)
+ list_add(&n->siblings, &root->siblings);
+ return n;
+}
+
+#ifdef CONFIG_DEDUCE_WAKEUP_REASONS
+static struct wakeup_irq_node* add_child(struct wakeup_irq_node *root, int irq)
+{
+ if (!root->child) {
+ root->child = alloc_irq_node(irq);
+ if (!root->child)
+ return NULL;
+ root->child->parent = root;
+ return root->child;
+ }
+
+ return add_to_siblings(root->child, irq);
+}
+
+static struct wakeup_irq_node *find_first_sibling(struct wakeup_irq_node *node)
+{
+ struct wakeup_irq_node *n;
+ if (node->parent)
+ return node;
+ list_for_each_entry(n, &node->siblings, siblings) {
+ if (n->parent)
+ return n;
+ }
+ return NULL;
+}
+
+static struct wakeup_irq_node *
+get_base_node(struct wakeup_irq_node *node, unsigned depth)
+{
+ if (!node)
+ return NULL;
+
+ while (depth) {
+ node = find_first_sibling(node);
+ BUG_ON(!node);
+ node = node->parent;
+ depth--;
+ }
+
+ return node;
+}
+#endif /* CONFIG_DEDUCE_WAKEUP_REASONS */
+
+static const struct list_head* get_wakeup_reasons_nosync(void);
+
+static void print_wakeup_sources(void)
+{
+ struct wakeup_irq_node *n;
+ const struct list_head *wakeups;
+
+ if (suspend_abort) {
+ pr_info("Abort: %s\n", abort_reason);
+ return;
+ }
+
+ wakeups = get_wakeup_reasons_nosync();
+ list_for_each_entry(n, wakeups, next) {
+ if (n->desc && n->desc->action && n->desc->action->name)
+ pr_info("Resume caused by IRQ %d, %s\n", n->irq,
+ n->desc->action->name);
+ else
+ pr_info("Resume caused by IRQ %d\n", n->irq);
+ }
+}
+
+static bool walk_irq_node_tree(struct wakeup_irq_node *root,
+ bool (*visit)(struct wakeup_irq_node *, void *),
+ void *cookie)
+{
+ struct wakeup_irq_node *n, *t;
+
+ if (!root)
+ return true;
+
+ list_for_each_entry_safe(n, t, &root->siblings, siblings) {
+ if (!walk_irq_node_tree(n->child, visit, cookie))
+ return false;
+ if (!visit(n, cookie))
+ return false;
+ }
+
+ if (!walk_irq_node_tree(root->child, visit, cookie))
+ return false;
+ return visit(root, cookie);
+}
+
+#ifdef CONFIG_DEDUCE_WAKEUP_REASONS
+static bool is_node_handled(struct wakeup_irq_node *n, void *_p)
+{
+ return n->handled;
+}
+
+static bool base_irq_nodes_done(void)
+{
+ return walk_irq_node_tree(base_irq_nodes, is_node_handled, NULL);
+}
+#endif
+
+struct buf_cookie {
+ char *buf;
+ int buf_offset;
+};
+
+static bool print_leaf_node(struct wakeup_irq_node *n, void *_p)
+{
+ struct buf_cookie *b = _p;
+ if (!n->child) {
+ if (n->desc && n->desc->action && n->desc->action->name)
+ b->buf_offset +=
+ snprintf(b->buf + b->buf_offset,
+ PAGE_SIZE - b->buf_offset,
+ "%d %s\n",
+ n->irq, n->desc->action->name);
+ else
+ b->buf_offset +=
+ snprintf(b->buf + b->buf_offset,
+ PAGE_SIZE - b->buf_offset,
+ "%d\n",
+ n->irq);
+ }
+ return true;
+}
+
+static ssize_t last_resume_reason_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ unsigned long flags;
+
+ struct buf_cookie b = {
+ .buf = buf,
+ .buf_offset = 0
+ };
+
+ spin_lock_irqsave(&resume_reason_lock, flags);
+ if (suspend_abort)
+ b.buf_offset = snprintf(buf, PAGE_SIZE, "Abort: %s", abort_reason);
+ else
+ walk_irq_node_tree(base_irq_nodes, print_leaf_node, &b);
+ spin_unlock_irqrestore(&resume_reason_lock, flags);
+
+ return b.buf_offset;
}
static ssize_t last_suspend_time_show(struct kobject *kobj,
@@ -104,56 +294,155 @@ static struct attribute_group attr_group = {
.attrs = attrs,
};
+static inline void stop_logging_wakeup_reasons(void)
+{
+ ACCESS_ONCE(log_wakeups) = false;
+ smp_wmb();
+}
+
/*
- * logs all the wake up reasons to the kernel
- * stores the irqs to expose them to the userspace via sysfs
+ * stores the immediate wakeup irqs; these often aren't the ones seen by
+ * the drivers that registered them, due to chained interrupt controllers,
+ * and multiple-interrupt dispatch.
*/
-void log_wakeup_reason(int irq)
+void log_base_wakeup_reason(int irq)
{
- struct irq_desc *desc;
- desc = irq_to_desc(irq);
- if (desc && desc->action && desc->action->name)
- printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq,
- desc->action->name);
- else
- printk(KERN_INFO "Resume caused by IRQ %d\n", irq);
+ /* No locking is needed, since this function is called within
+ * syscore_resume, with both nonboot CPUs and interrupts disabled.
+ */
+ base_irq_nodes = add_to_siblings(base_irq_nodes, irq);
+ BUG_ON(!base_irq_nodes);
+#ifndef CONFIG_DEDUCE_WAKEUP_REASONS
+ base_irq_nodes->handled = true;
+#endif
+}
- spin_lock(&resume_reason_lock);
- if (irqcount == MAX_WAKEUP_REASON_IRQS) {
- spin_unlock(&resume_reason_lock);
- printk(KERN_WARNING "Resume caused by more than %d IRQs\n",
- MAX_WAKEUP_REASON_IRQS);
- return;
+#ifdef CONFIG_DEDUCE_WAKEUP_REASONS
+
+/* This function is called by generic_handle_irq, which may call itself
+ * recursively. This happens with interrupts disabled. Using
+ * log_possible_wakeup_reason, we build a tree of interrupts, tracing the call
+ * stack of generic_handle_irq, for each wakeup source containing the
+ * interrupts actually handled.
+ *
+ * Most of these "trees" would either have a single node (in the event that the
+ * wakeup source is the final interrupt), or consist of a list of two
+ * interrupts, with the wakeup source at the root, and the final dispatched
+ * interrupt at the leaf.
+ *
+ * When *all* wakeup sources have been thusly spoken for, this function will
+ * clear the log_wakeups flag, and print the wakeup reasons.
+
+ TODO: percpu
+
+ */
+
+static struct wakeup_irq_node *
+log_possible_wakeup_reason_start(int irq, struct irq_desc *desc, unsigned depth)
+{
+ BUG_ON(!irqs_disabled());
+ BUG_ON((signed)depth < 0);
+
+ /* This function can race with a call to stop_logging_wakeup_reasons()
+ * from a thread context. If this happens, just exit silently, as we are no
+ * longer interested in logging interrupts.
+ */
+ if (!logging_wakeup_reasons())
+ return NULL;
+
+ /* If suspend was aborted, the base IRQ nodes are missing, and we stop
+ * logging interrupts immediately.
+ */
+ if (!base_irq_nodes) {
+ stop_logging_wakeup_reasons();
+ return NULL;
+ }
+
+ /* We assume wakeup interrupts are handled only by the first core. */
+ /* TODO: relax this by having percpu versions of the irq tree */
+ if (smp_processor_id() != 0) {
+ return NULL;
}
- irq_list[irqcount++] = irq;
- spin_unlock(&resume_reason_lock);
+ if (depth == 0) {
+ cur_irq_tree_depth = 0;
+ cur_irq_tree = search_siblings(base_irq_nodes, irq);
+ } else if (cur_irq_tree) {
+ if (depth > cur_irq_tree_depth) {
+ BUG_ON(depth - cur_irq_tree_depth > 1);
+ cur_irq_tree = add_child(cur_irq_tree, irq);
+ if (cur_irq_tree)
+ cur_irq_tree_depth++;
+ } else {
+ cur_irq_tree = get_base_node(cur_irq_tree,
+ cur_irq_tree_depth - depth);
+ cur_irq_tree_depth = depth;
+ cur_irq_tree = add_to_siblings(cur_irq_tree, irq);
+ }
+ }
+
+ return cur_irq_tree;
}
-int check_wakeup_reason(int irq)
+static void log_possible_wakeup_reason_complete(struct wakeup_irq_node *n,
+ unsigned depth,
+ bool handled)
{
- int irq_no;
- int ret = false;
-
- spin_lock(&resume_reason_lock);
- for (irq_no = 0; irq_no < irqcount; irq_no++)
- if (irq_list[irq_no] == irq) {
- ret = true;
- break;
+ if (!n)
+ return;
+ n->handled = handled;
+ if (depth == 0) {
+ if (base_irq_nodes_done()) {
+ stop_logging_wakeup_reasons();
+ complete(&wakeups_completion);
+ print_wakeup_sources();
+ }
}
- spin_unlock(&resume_reason_lock);
- return ret;
}
+bool log_possible_wakeup_reason(int irq,
+ struct irq_desc *desc,
+ bool (*handler)(struct irq_desc *))
+{
+ static DEFINE_PER_CPU(unsigned int, depth);
+
+ struct wakeup_irq_node *n;
+ bool handled;
+ unsigned d;
+
+ d = get_cpu_var(depth)++;
+ put_cpu_var(depth);
+
+ n = log_possible_wakeup_reason_start(irq, desc, d);
+
+ handled = handler(desc);
+
+ d = --get_cpu_var(depth);
+ put_cpu_var(depth);
+
+ if (!handled && desc && desc->action)
+ pr_debug("%s: irq %d action %pF not handled\n", __func__,
+ irq, desc->action->handler);
+
+ log_possible_wakeup_reason_complete(n, d, handled);
+
+ return handled;
+}
+
+#endif /* CONFIG_DEDUCE_WAKEUP_REASONS */
+
void log_suspend_abort_reason(const char *fmt, ...)
{
va_list args;
+ unsigned long flags;
- spin_lock(&resume_reason_lock);
+ spin_lock_irqsave(&resume_reason_lock, flags);
/* Suspend abort reason has already been logged. */
if (suspend_abort) {
- spin_unlock(&resume_reason_lock);
+ spin_unlock_irqrestore(&resume_reason_lock, flags);
return;
}
@@ -161,29 +450,128 @@ void log_suspend_abort_reason(const char *fmt, ...)
va_start(args, fmt);
vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args);
va_end(args);
- spin_unlock(&resume_reason_lock);
+
+ spin_unlock_irqrestore(&resume_reason_lock, flags);
+}
+
+static bool match_node(struct wakeup_irq_node *n, void *_p)
+{
+ int irq = *((int *)_p);
+ return n->irq != irq;
+}
+
+int check_wakeup_reason(int irq)
+{
+ bool found;
+ unsigned long flags;
+ spin_lock_irqsave(&resume_reason_lock, flags);
+ found = !walk_irq_node_tree(base_irq_nodes, match_node, &irq);
+ spin_unlock_irqrestore(&resume_reason_lock, flags);
+ return found;
+}
+
+static bool build_leaf_nodes(struct wakeup_irq_node *n, void *_p)
+{
+ struct list_head *wakeups = _p;
+ if (!n->child)
+ list_add(&n->next, wakeups);
+ return true;
+}
+
+static const struct list_head* get_wakeup_reasons_nosync(void)
+{
+ BUG_ON(logging_wakeup_reasons());
+ INIT_LIST_HEAD(&wakeup_irqs);
+ walk_irq_node_tree(base_irq_nodes, build_leaf_nodes, &wakeup_irqs);
+ return &wakeup_irqs;
+}
+
+static bool build_unfinished_nodes(struct wakeup_irq_node *n, void *_p)
+{
+ struct list_head *unfinished = _p;
+ if (!n->handled) {
+ pr_warning("%s: wakeup irq %d was not handled\n",
+ __func__, n->irq);
+ list_add(&n->next, unfinished);
+ }
+ return true;
+}
+
+const struct list_head* get_wakeup_reasons(unsigned long timeout,
+ struct list_head *unfinished)
+{
+ INIT_LIST_HEAD(unfinished);
+
+ if (logging_wakeup_reasons()) {
+ unsigned long signalled = 0;
+ if (timeout)
+ signalled = wait_for_completion_timeout(&wakeups_completion, timeout);
+ if (WARN_ON(!signalled)) {
+ stop_logging_wakeup_reasons();
+ walk_irq_node_tree(base_irq_nodes, build_unfinished_nodes, unfinished);
+ return NULL;
+ }
+ pr_info("%s: waited for %u ms\n",
+ __func__,
+ jiffies_to_msecs(timeout - signalled));
+ }
+
+ return get_wakeup_reasons_nosync();
+}
+
+static bool delete_node(struct wakeup_irq_node *n, void *unused)
+{
+ list_del(&n->siblings);
+ kmem_cache_free(wakeup_irq_nodes_cache, n);
+ return true;
+}
+
+void clear_wakeup_reasons(void)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&resume_reason_lock, flags);
+
+ BUG_ON(logging_wakeup_reasons());
+ walk_irq_node_tree(base_irq_nodes, delete_node, NULL);
+ base_irq_nodes = NULL;
+ cur_irq_tree = NULL;
+ cur_irq_tree_depth = 0;
+ INIT_LIST_HEAD(&wakeup_irqs);
+ suspend_abort = false;
+
+ spin_unlock_irqrestore(&resume_reason_lock, flags);
}
/* Detects a suspend and clears all the previous wakeup reasons */
static int wakeup_reason_pm_event(struct notifier_block *notifier,
unsigned long pm_event, void *unused)
{
+ unsigned long flags;
switch (pm_event) {
case PM_SUSPEND_PREPARE:
- spin_lock(&resume_reason_lock);
- irqcount = 0;
+ spin_lock_irqsave(&resume_reason_lock, flags);
suspend_abort = false;
- spin_unlock(&resume_reason_lock);
+ spin_unlock_irqrestore(&resume_reason_lock, flags);
/* monotonic time since boot */
last_monotime = ktime_get();
/* monotonic time since boot including the time spent in suspend */
last_stime = ktime_get_boottime();
+ clear_wakeup_reasons();
break;
case PM_POST_SUSPEND:
/* monotonic time since boot */
curr_monotime = ktime_get();
/* monotonic time since boot including the time spent in suspend */
curr_stime = ktime_get_boottime();
+#ifdef CONFIG_DEDUCE_WAKEUP_REASONS
+ /* log_wakeups should have been cleared by now. */
+ if (WARN_ON(logging_wakeup_reasons())) {
+ stop_logging_wakeup_reasons();
+ print_wakeup_sources();
+ }
+#else
+ print_wakeup_sources();
+#endif
break;
default:
break;
@@ -195,31 +583,46 @@ static struct notifier_block wakeup_reason_pm_notifier_block = {
.notifier_call = wakeup_reason_pm_event,
};
-/* Initializes the sysfs parameter
- * registers the pm_event notifier
- */
int __init wakeup_reason_init(void)
{
- int retval;
+ spin_lock_init(&resume_reason_lock);
- retval = register_pm_notifier(&wakeup_reason_pm_notifier_block);
- if (retval)
- printk(KERN_WARNING "[%s] failed to register PM notifier %d\n",
- __func__, retval);
+ if (register_pm_notifier(&wakeup_reason_pm_notifier_block)) {
+ pr_warning("[%s] failed to register PM notifier\n",
+ __func__);
+ goto fail;
+ }
wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj);
if (!wakeup_reason) {
- printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n",
+ pr_warning("[%s] failed to create a sysfs kobject\n",
__func__);
- return 1;
+ goto fail_unregister_pm_notifier;
}
- retval = sysfs_create_group(wakeup_reason, &attr_group);
- if (retval) {
- kobject_put(wakeup_reason);
- printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n",
- __func__, retval);
+
+ if (sysfs_create_group(wakeup_reason, &attr_group)) {
+ pr_warning("[%s] failed to create a sysfs group\n",
+ __func__);
+ goto fail_kobject_put;
}
+
+ wakeup_irq_nodes_cache =
+ kmem_cache_create("wakeup_irq_node_cache",
+ sizeof(struct wakeup_irq_node), 0,
+ 0, NULL);
+ if (!wakeup_irq_nodes_cache)
+ goto fail_remove_group;
+
return 0;
+
+fail_remove_group:
+ sysfs_remove_group(wakeup_reason, &attr_group);
+fail_kobject_put:
+ kobject_put(wakeup_reason);
+fail_unregister_pm_notifier:
+ unregister_pm_notifier(&wakeup_reason_pm_notifier_block);
+fail:
+ return 1;
}
late_initcall(wakeup_reason_init);
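
The new wakeup-reason code funnels every traversal of the IRQ tree through walk_irq_node_tree() callbacks (print_leaf_node, match_node, build_leaf_nodes, delete_node); note that check_wakeup_reason() relies on the walk aborting early on a hit, so "found" is the negation of the walker's return value. Below is a minimal userspace sketch of that visitor idiom; the node layout, traversal order, and all names are assumptions, since walk_irq_node_tree() itself is outside this hunk.

#include <stdbool.h>
#include <stdio.h>

struct irq_node {
	int irq;
	struct irq_node *child;    /* first interrupt dispatched from this one */
	struct irq_node *sibling;  /* next interrupt at the same depth */
};

typedef bool (*visit_fn)(struct irq_node *n, void *cookie);

/* Pre-order walk; a callback returning false aborts the whole walk. */
static bool walk_tree(struct irq_node *n, visit_fn visit, void *cookie)
{
	for (; n; n = n->sibling) {
		if (!visit(n, cookie))
			return false;
		if (n->child && !walk_tree(n->child, visit, cookie))
			return false;
	}
	return true;
}

/* Mirrors match_node() above: a hit returns false, aborting the walk. */
static bool match_node(struct irq_node *n, void *cookie)
{
	return n->irq != *(int *)cookie;
}

int main(void)
{
	struct irq_node leaf = { .irq = 42, .child = NULL, .sibling = NULL };
	struct irq_node root = { .irq = 7, .child = &leaf, .sibling = NULL };
	int wanted = 42;

	/* As in check_wakeup_reason(): found == the walk aborted early. */
	bool found = !walk_tree(&root, match_node, &wanted);
	printf("irq %d %s\n", wanted, found ? "found" : "not found");
	return 0;
}
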
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fd63b4d06139..4bbafc2a4822 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -497,7 +497,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, int source)
+static int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -525,7 +525,6 @@ int check_syslog_permissions(int type, int source)
ok:
return security_syslog(type);
}
-EXPORT_SYMBOL_GPL(check_syslog_permissions);
static void append_char(char **pp, char *e, char c)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f6f8bb2f0d95..72e1ffe809f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2132,6 +2132,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
raw_spin_unlock(&rq->lock);
rcu_read_lock();
@@ -2225,6 +2226,7 @@ static void try_to_wake_up_local(struct task_struct *p)
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
note_task_waking(p, wallclock);
}
@@ -3196,6 +3198,8 @@ void scheduler_tick(void)
calc_global_load_tick(rq);
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+
+ cpufreq_update_util(rq, 0);
early_notif = early_detection_notify(rq, wallclock);
raw_spin_unlock(&rq->lock);
@@ -3564,6 +3568,7 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
+ cpufreq_update_util(rq, 0);
if (!is_idle_task(prev) && !prev->on_rq)
update_avg_burst(prev);
@@ -3582,6 +3587,7 @@ static void __sched notrace __schedule(bool preempt)
cpu = cpu_of(rq);
} else {
update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
}
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index ce15ae7fe76b..99f16128cf49 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -22,6 +22,7 @@
#include <linux/sched/rt.h>
#include <trace/events/sched.h>
+#include "sched.h"
#define MAX_CPUS_PER_CLUSTER 4
#define MAX_CLUSTERS 2
@@ -575,7 +576,8 @@ static bool eval_need(struct cluster_data *cluster)
cluster->active_cpus = get_active_cpu_count(cluster);
thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0;
list_for_each_entry(c, &cluster->lru, sib) {
- if (c->busy >= cluster->busy_up_thres[thres_idx])
+ if (c->busy >= cluster->busy_up_thres[thres_idx] ||
+ sched_cpu_high_irqload(c->cpu))
c->is_busy = true;
else if (c->busy < cluster->busy_down_thres[thres_idx])
c->is_busy = false;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df2e6dd2c665..3f2b5c04623c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3001,6 +3001,8 @@ struct cpu_select_env *env, struct cluster_cpu_stats *stats)
int i;
struct cpumask search_cpus;
+ extern int num_clusters;
+
while (!bitmap_empty(env->backup_list, num_clusters)) {
next = next_candidate(env->backup_list, 0, num_clusters);
__clear_bit(next->id, env->backup_list);
@@ -3024,6 +3026,8 @@ next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
{
struct sched_cluster *next = NULL;
+ extern int num_clusters;
+
__clear_bit(cluster->id, env->candidate_list);
if (env->rtg && preferred_cluster(cluster, env->p))
@@ -5673,20 +5677,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
- new = (old * 147) / 128; /* ~115% */
- new = min(new, max_cfs_quota_period);
-
- cfs_b->period = ns_to_ktime(new);
-
- /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
- cfs_b->quota *= new;
- cfs_b->quota = div64_u64(cfs_b->quota, old);
-
- pr_warn_ratelimited(
- "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
- smp_processor_id(),
- div_u64(new, NSEC_PER_USEC),
- div_u64(cfs_b->quota, NSEC_PER_USEC));
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
/* reset count so we don't come right back in here */
count = 0;
@@ -10536,7 +10548,6 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -10550,6 +10561,12 @@ more_balance:
}
/*
+ * Set loop_max when rq's lock is taken to prevent a race.
+ */
+ env.loop_max = min(sysctl_sched_nr_migrate,
+ busiest->nr_running);
+
+ /*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
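
The switch from scaling the CFS period by 147/128 to doubling it matters because the quota is rescaled with truncating integer division, so repeated odd-ratio growth lets the quota/period ratio drift; multiplying both by two keeps the ratio exact by construction. A small userspace check with made-up values, mirroring the old arithmetic (quota *= new; quota /= old):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t period = 100000;	/* ns, made up */
	uint64_t quota  = 33333;	/* ns, made up */

	/* Old scheme: ~115% growth, quota rescaled by new/old with
	 * truncating division, so the ratio drifts a little each time. */
	uint64_t new_period = (period * 147) / 128;
	uint64_t new_quota  = (quota * new_period) / period;

	/* New scheme: double both; the ratio is preserved exactly. */
	uint64_t dbl_period = period * 2;
	uint64_t dbl_quota  = quota * 2;

	printf("old: %" PRIu64 "/%" PRIu64 " (ratio drifted)\n",
	       new_quota, new_period);
	printf("new: %" PRIu64 "/%" PRIu64 " (ratio exact)\n",
	       dbl_quota, dbl_period);
	return 0;
}
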
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 5337ac7fcba1..598656b42203 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -3217,6 +3217,13 @@ void sched_get_cpus_busy(struct sched_load *busy,
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
0);
+ /*
+ * Ensure that we don't report load for 'cpu' again via the
+ * cpufreq_update_util path in the window that started at
+ * rq->window_start
+ */
+ rq->load_reported_window = rq->window_start;
+
account_load_subtractions(rq);
load[i] = rq->prev_runnable_sum;
nload[i] = rq->nt_prev_runnable_sum;
@@ -3649,6 +3656,11 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
migrate_top_tasks(p, src_rq, dest_rq);
+ if (!same_freq_domain(new_cpu, task_cpu(p))) {
+ cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG);
+ cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG);
+ }
+
if (p == src_rq->ed_task) {
src_rq->ed_task = NULL;
if (!dest_rq->ed_task)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 07a3cd3c6fbc..90cc450dff7e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -430,7 +430,6 @@ struct related_thread_group {
};
extern struct list_head cluster_head;
-extern int num_clusters;
extern struct sched_cluster *sched_cluster[NR_CPUS];
struct cpu_cycle {
@@ -441,6 +440,7 @@ struct cpu_cycle {
#define for_each_sched_cluster(cluster) \
list_for_each_entry_rcu(cluster, &cluster_head, list)
+extern unsigned int sched_disable_window_stats;
#endif /* CONFIG_SCHED_HMP */
/* CFS-related fields in a runqueue */
@@ -793,6 +793,7 @@ struct rq {
int cstate, wakeup_latency, wakeup_energy;
u64 window_start;
+ u64 load_reported_window;
unsigned long hmp_flags;
u64 cur_irqload;
@@ -1853,7 +1854,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
-#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
@@ -2852,6 +2853,18 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
struct update_util_data *data;
+#ifdef CONFIG_SCHED_HMP
+ /*
+ * Skip if we've already reported, but not if this is an inter-cluster
+ * migration
+ */
+ if (!sched_disable_window_stats &&
+ (rq->load_reported_window == rq->window_start) &&
+ !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG))
+ return;
+ rq->load_reported_window = rq->window_start;
+#endif
+
data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
if (data)
data->func(data, rq_clock(rq), flags);
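
The guard added to cpufreq_update_util() is a once-per-window latch: stamp rq->load_reported_window on the first report of a window and skip later calls in the same window, unless the inter-cluster-migration flag forces a report. A userspace sketch of that latch, with illustrative names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cpu_load_state {
	uint64_t window_start;          /* as in rq->window_start */
	uint64_t load_reported_window;  /* as in rq->load_reported_window */
};

static bool should_report(struct cpu_load_state *s, bool force_migration)
{
	if (!force_migration && s->load_reported_window == s->window_start)
		return false;              /* already reported this window */
	s->load_reported_window = s->window_start;
	return true;
}

int main(void)
{
	struct cpu_load_state s = { .window_start = 100 };

	printf("%d\n", should_report(&s, false)); /* 1: first report */
	printf("%d\n", should_report(&s, false)); /* 0: duplicate */
	printf("%d\n", should_report(&s, true));  /* 1: forced, like an inter-cluster migration */
	s.window_start = 200;                     /* a new window begins */
	printf("%d\n", should_report(&s, false)); /* 1: new window */
	return 0;
}
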
diff --git a/kernel/signal.c b/kernel/signal.c
index 3095b2309876..6aa9ca45ebb1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -79,6 +79,11 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return 1;
+ /* Only allow kernel generated signals to this kthread */
+ if (unlikely((t->flags & PF_KTHREAD) &&
+ (handler == SIG_KTHREAD_KERNEL) && !force))
+ return true;
+
return sig_handler_ignored(handler, sig);
}
@@ -368,27 +373,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
{
struct sigqueue *q = NULL;
struct user_struct *user;
+ int sigpending;
/*
* Protect access to @t credentials. This can go away when all
* callers hold rcu read lock.
+ *
+ * NOTE! A pending signal will hold on to the user refcount,
+ * and we get/put the refcount only when the sigpending count
+ * changes from/to zero.
*/
rcu_read_lock();
- user = get_uid(__task_cred(t)->user);
- atomic_inc(&user->sigpending);
+ user = __task_cred(t)->user;
+ sigpending = atomic_inc_return(&user->sigpending);
+ if (sigpending == 1)
+ get_uid(user);
rcu_read_unlock();
- if (override_rlimit ||
- atomic_read(&user->sigpending) <=
- task_rlimit(t, RLIMIT_SIGPENDING)) {
+ if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, flags);
} else {
print_dropped_signal(sig);
}
if (unlikely(q == NULL)) {
- atomic_dec(&user->sigpending);
- free_uid(user);
+ if (atomic_dec_and_test(&user->sigpending))
+ free_uid(user);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
@@ -402,8 +412,8 @@ static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- atomic_dec(&q->user->sigpending);
- free_uid(q->user);
+ if (atomic_dec_and_test(&q->user->sigpending))
+ free_uid(q->user);
kmem_cache_free(sigqueue_cachep, q);
}
@@ -1650,7 +1660,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
* This is only possible if parent == real_parent.
* Check if it has changed security domain.
*/
- if (tsk->parent_exec_id != tsk->parent->self_exec_id)
+ if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
sig = SIGCHLD;
}
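
The sigpending change ties the uid reference to counter transitions instead of taking one reference per queued signal: only the 0 -> 1 transition calls get_uid() and only the 1 -> 0 transition calls free_uid(). A userspace sketch of the same pairing using C11 atomics (atomic_fetch_add returns the old value, so the +1/-1 below recovers the kernel's inc_return/dec_and_test semantics); get_uid()/free_uid() are stand-ins:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int sigpending;
static atomic_int uid_refs = 1;   /* stand-in for the uid refcount */

static void get_uid(void)  { atomic_fetch_add(&uid_refs, 1); }
static void free_uid(void) { atomic_fetch_sub(&uid_refs, 1); }

static void pend_signal(void)
{
	/* Only the 0 -> 1 transition takes a uid reference. */
	if (atomic_fetch_add(&sigpending, 1) + 1 == 1)
		get_uid();
}

static void unpend_signal(void)
{
	/* Only the 1 -> 0 transition drops it. */
	if (atomic_fetch_sub(&sigpending, 1) - 1 == 0)
		free_uid();
}

int main(void)
{
	pend_signal();
	pend_signal();   /* no extra uid reference taken */
	unpend_signal();
	unpend_signal(); /* uid reference dropped here */
	printf("uid_refs back to %d\n", atomic_load(&uid_refs));
	return 0;
}
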
diff --git a/kernel/smp.c b/kernel/smp.c
index b2ec21c5c9d6..3f300d3fd1f6 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -14,6 +14,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/suspend.h>
#include "smpboot.h"
@@ -766,7 +767,8 @@ void wake_up_all_idle_cpus(void)
for_each_online_cpu(cpu) {
if (cpu == smp_processor_id())
continue;
- if (!cpu_isolated(cpu))
+ if (suspend_freeze_state == FREEZE_STATE_ENTER ||
+ !cpu_isolated(cpu))
wake_up_if_idle(cpu);
}
preempt_enable();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b25717eb3d3a..ac0569f7d56a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -98,6 +98,10 @@
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
+#ifdef CONFIG_USB
+int deny_new_usb __read_mostly = 0;
+EXPORT_SYMBOL(deny_new_usb);
+#endif
extern int suid_dumpable;
#ifdef CONFIG_COREDUMP
extern int core_uses_pid;
@@ -1083,6 +1087,17 @@ static struct ctl_table kern_table[] = {
.extra2 = &two,
},
#endif
+#ifdef CONFIG_USB
+ {
+ .procname = "deny_new_usb",
+ .data = &deny_new_usb,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{
.procname = "ngroups_max",
.data = &ngroups_max,
@@ -1640,7 +1655,7 @@ static struct ctl_table vm_table[] = {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
- .mode = 0644,
+ .mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = &one,
.extra2 = &four,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11cc757795cd..8acdd6ae532c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -586,25 +586,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
- struct taskstats *stats;
+ struct taskstats *stats_new, *stats;
- if (sig->stats || thread_group_empty(tsk))
- goto ret;
+ /* Pairs with smp_store_release() below. */
+ stats = smp_load_acquire(&sig->stats);
+ if (stats || thread_group_empty(tsk))
+ return stats;
/* No problem if kmem_cache_zalloc() fails */
- stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+ stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
spin_lock_irq(&tsk->sighand->siglock);
- if (!sig->stats) {
- sig->stats = stats;
- stats = NULL;
+ stats = sig->stats;
+ if (!stats) {
+ /*
+ * Pairs with smp_load_acquire() above and orders the
+ * kmem_cache_zalloc().
+ */
+ smp_store_release(&sig->stats, stats_new);
+ stats = stats_new;
+ stats_new = NULL;
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (stats)
- kmem_cache_free(taskstats_cache, stats);
-ret:
- return sig->stats;
+ if (stats_new)
+ kmem_cache_free(taskstats_cache, stats_new);
+
+ return stats;
}
/* Send pid data out on exit */
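
taskstats_tgid_alloc() now uses the classic double-checked publication pattern: a lockless smp_load_acquire() fast path, then a re-check under siglock, and an smp_store_release() that orders the zeroing allocation before the pointer becomes visible. A compilable userspace analogue using C11 acquire/release and a mutex; the struct and function names are invented:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct stats { long counters[8]; };

static _Atomic(struct stats *) shared_stats;
static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;

static struct stats *stats_get_or_alloc(void)
{
	/* Fast path: pairs with the release store below. */
	struct stats *s = atomic_load_explicit(&shared_stats,
					       memory_order_acquire);
	if (s)
		return s;

	/* No problem if the allocation fails; we just return NULL. */
	struct stats *s_new = calloc(1, sizeof(*s_new));

	pthread_mutex_lock(&stats_lock);
	s = atomic_load_explicit(&shared_stats, memory_order_relaxed);
	if (!s && s_new) {
		/* Publish: orders the calloc()'s zeroing before the
		 * pointer becomes visible to the fast path above. */
		atomic_store_explicit(&shared_stats, s_new,
				      memory_order_release);
		s = s_new;
		s_new = NULL;
	}
	pthread_mutex_unlock(&stats_lock);

	free(s_new);	/* lost the race (or never needed): discard */
	return s;
}

int main(void)
{
	return stats_get_or_alloc() == stats_get_or_alloc() ? 0 : 1;
}
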
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 89cc82a38e4d..3b43159ab41f 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -272,8 +272,15 @@ static void clocksource_watchdog(unsigned long data)
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(cpu_online_mask);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+
+ /*
+ * Arm the timer only if it is not already pending: this could race
+ * with a concurrent clocksource_stop_watchdog()/clocksource_start_watchdog() pair.
+ */
+ if (!timer_pending(&watchdog_timer)) {
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, next_cpu);
+ }
out:
spin_unlock(&watchdog_lock);
}
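
The clocksource fix only re-arms the watchdog when timer_pending() is false, so a racing stop/start pair cannot leave the timer queued twice. A toy sketch of that guard, with a flag under a mutex standing in for the timer API:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t wd_lock = PTHREAD_MUTEX_INITIALIZER;
static bool timer_pending_flag;
static uint64_t expires;

static void watchdog_tick(uint64_t interval)
{
	pthread_mutex_lock(&wd_lock);
	if (!timer_pending_flag) {         /* lost a stop/start race? */
		expires += interval;
		timer_pending_flag = true; /* re-armed exactly once */
	}
	pthread_mutex_unlock(&wd_lock);
}

int main(void)
{
	watchdog_tick(500);
	watchdog_tick(500);  /* no-op: already pending */
	printf("expires=%llu pending=%d\n",
	       (unsigned long long)expires, timer_pending_flag);
	return 0;
}
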
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index e24008c098c6..45a0a26023d4 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -25,8 +25,6 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
-static void delete_clock(struct kref *kref);
-
/*
* Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
*/
@@ -168,7 +166,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = 0;
if (!err) {
- kref_get(&clk->kref);
+ get_device(clk->dev);
fp->private_data = clk;
}
out:
@@ -184,7 +182,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
if (clk->ops.release)
err = clk->ops.release(clk);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
fp->private_data = NULL;
@@ -206,38 +204,35 @@ static const struct file_operations posix_clock_file_operations = {
#endif
};
-int posix_clock_register(struct posix_clock *clk, dev_t devid)
+int posix_clock_register(struct posix_clock *clk, struct device *dev)
{
int err;
- kref_init(&clk->kref);
init_rwsem(&clk->rwsem);
cdev_init(&clk->cdev, &posix_clock_file_operations);
+ err = cdev_device_add(&clk->cdev, dev);
+ if (err) {
+ pr_err("%s unable to add device %d:%d\n",
+ dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
+ return err;
+ }
clk->cdev.owner = clk->ops.owner;
- err = cdev_add(&clk->cdev, devid, 1);
+ clk->dev = dev;
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(posix_clock_register);
-static void delete_clock(struct kref *kref)
-{
- struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
-
- if (clk->release)
- clk->release(clk);
-}
-
void posix_clock_unregister(struct posix_clock *clk)
{
- cdev_del(&clk->cdev);
+ cdev_device_del(&clk->cdev, clk->dev);
down_write(&clk->rwsem);
clk->zombie = true;
up_write(&clk->rwsem);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
}
EXPORT_SYMBOL_GPL(posix_clock_unregister);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d176c127f744..0a4f0c09bcbf 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1253,7 +1253,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
unsigned long long now = 0;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
- cpu_timer_sample_group(clock_idx, tsk, &now);
+ if (cpu_timer_sample_group(clock_idx, tsk, &now))
+ return;
if (oldval) {
/*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3c7b7a9bcad1..d8b00f94c63d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -136,6 +136,20 @@ config GENERIC_TRACER
bool
select TRACING
+if TRACING
+
+config DISABLE_TRACE_PRINTK
+ bool "Force disable trace_printk() usage"
+ default y
+ help
+ When trace_printk() is used anywhere in the kernel source, it enables
+ debugging functions which are not desired for a production kernel.
+ Enabling this option will replace trace_printk() with pr_debug().
+
+ If in doubt, say Y.
+
+endif
+
#
# Minimum requirements an architecture has to meet for us to
# be able to offer generic tracing facilities:
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c39fc68c4778..54be8790941e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -319,11 +319,12 @@ static void put_probe_ref(void)
static void blk_trace_cleanup(struct blk_trace *bt)
{
+ synchronize_rcu();
blk_trace_free(bt);
put_probe_ref();
}
-int blk_trace_remove(struct request_queue *q)
+static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
@@ -336,6 +337,17 @@ int blk_trace_remove(struct request_queue *q)
return 0;
}
+
+int blk_trace_remove(struct request_queue *q)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_remove(q);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_remove);
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -546,9 +558,8 @@ err:
return ret;
}
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- char __user *arg)
+static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev, char __user *arg)
{
struct blk_user_trace_setup buts;
int ret;
@@ -562,11 +573,24 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
return 0;
}
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_setup(q, name, dev, bdev, arg);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -595,7 +619,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
@@ -603,11 +627,13 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
}
#endif
-int blk_trace_startstop(struct request_queue *q, int start)
+static int __blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
@@ -642,8 +668,25 @@ int blk_trace_startstop(struct request_queue *q, int start)
return ret;
}
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_startstop(q, start);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_startstop);
+/*
+ * When reading or writing the blktrace sysfs files, the references to the
+ * opened sysfs or device files should prevent the underlying block device
+ * from being removed. So no further delete protection is really needed.
+ */
+
/**
* blk_trace_ioctl: - handle the ioctls associated with tracing
* @bdev: the block device
@@ -661,12 +704,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
switch (cmd) {
case BLKTRACESETUP:
bdevname(bdev, b);
- ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
@@ -677,17 +720,17 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
case BLKTRACESTART:
start = 1;
case BLKTRACESTOP:
- ret = blk_trace_startstop(q, start);
+ ret = __blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
- ret = blk_trace_remove(q);
+ ret = __blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
return ret;
}
@@ -698,10 +741,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
**/
void blk_trace_shutdown(struct request_queue *q)
{
- if (q->blk_trace) {
- blk_trace_startstop(q, 0);
- blk_trace_remove(q);
+ mutex_lock(&q->blk_trace_mutex);
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex))) {
+ __blk_trace_startstop(q, 0);
+ __blk_trace_remove(q);
}
+
+ mutex_unlock(&q->blk_trace_mutex);
}
/*
@@ -722,11 +769,15 @@ void blk_trace_shutdown(struct request_queue *q)
static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
unsigned int nr_bytes, u32 what)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct task_struct *tsk = current;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
/*
* Use the bio context for all events except ISSUE and
@@ -751,6 +802,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
rq->cmd_flags, what, rq->errors, 0, NULL, tsk);
}
+ rcu_read_unlock();
}
static void blk_add_trace_rq_abort(void *ignore,
@@ -800,11 +852,15 @@ static void blk_add_trace_rq_complete(void *ignore,
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct task_struct *tsk = current;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
/*
* Not all the pages in the bio are dirtied by the same task but
@@ -817,6 +873,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, what, error, 0, NULL, tsk);
+ rcu_read_unlock();
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -861,11 +918,14 @@ static void blk_add_trace_getrq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0,
NULL, current);
+ rcu_read_unlock();
}
}
@@ -877,28 +937,36 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
0, 0, NULL, current);
+ rcu_read_unlock();
}
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL,
current);
+ rcu_read_unlock();
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
unsigned int depth, bool explicit)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
u32 what;
@@ -911,15 +979,18 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu,
current);
}
+ rcu_read_unlock();
}
static void blk_add_trace_split(void *ignore,
struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct task_struct *tsk = current;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
@@ -932,6 +1003,7 @@ static void blk_add_trace_split(void *ignore,
bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
bio->bi_error, sizeof(rpdu), &rpdu, tsk);
}
+ rcu_read_unlock();
}
/**
@@ -951,12 +1023,16 @@ static void blk_add_trace_bio_remap(void *ignore,
struct request_queue *q, struct bio *bio,
dev_t dev, sector_t from)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct blk_io_trace_remap r;
struct task_struct *tsk = current;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
@@ -970,6 +1046,7 @@ static void blk_add_trace_bio_remap(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
sizeof(r), &r, tsk);
+ rcu_read_unlock();
}
/**
@@ -990,12 +1067,16 @@ static void blk_add_trace_rq_remap(void *ignore,
struct request *rq, dev_t dev,
sector_t from)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct blk_io_trace_remap r;
struct task_struct *tsk = current;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
@@ -1009,6 +1090,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
sizeof(r), &r, tsk);
+ rcu_read_unlock();
}
/**
@@ -1026,11 +1108,15 @@ void blk_add_driver_data(struct request_queue *q,
struct request *rq,
void *data, size_t len)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct task_struct *tsk = current;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
if (bio_has_data(rq->bio) && rq->bio->bi_io_vec &&
rq->bio->bi_io_vec->bv_page &&
@@ -1043,6 +1129,7 @@ void blk_add_driver_data(struct request_queue *q,
else
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
BLK_TA_DRV_DATA, rq->errors, len, data, tsk);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1534,6 +1621,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
return -EINVAL;
put_probe_ref();
+ synchronize_rcu();
blk_trace_free(bt);
return 0;
}
@@ -1694,6 +1782,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct hd_struct *p = dev_to_part(dev);
struct request_queue *q;
struct block_device *bdev;
+ struct blk_trace *bt;
ssize_t ret = -ENXIO;
bdev = bdget(part_devt(p));
@@ -1704,26 +1793,28 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
- ret = sprintf(buf, "%u\n", !!q->blk_trace);
+ ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
}
- if (q->blk_trace == NULL)
+ if (bt == NULL)
ret = sprintf(buf, "disabled\n");
else if (attr == &dev_attr_act_mask)
- ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
+ ret = blk_trace_mask2str(buf, bt->act_mask);
else if (attr == &dev_attr_pid)
- ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+ ret = sprintf(buf, "%u\n", bt->pid);
else if (attr == &dev_attr_start_lba)
- ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+ ret = sprintf(buf, "%llu\n", bt->start_lba);
else if (attr == &dev_attr_end_lba)
- ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+ ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
@@ -1737,6 +1828,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct block_device *bdev;
struct request_queue *q;
struct hd_struct *p;
+ struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
@@ -1765,10 +1857,12 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
- if (!!value == !!q->blk_trace) {
+ if (!!value == !!bt) {
ret = 0;
goto out_unlock_bdev;
}
@@ -1780,22 +1874,25 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
ret = 0;
- if (q->blk_trace == NULL)
+ if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
+ }
if (ret == 0) {
if (attr == &dev_attr_act_mask)
- q->blk_trace->act_mask = value;
+ bt->act_mask = value;
else if (attr == &dev_attr_pid)
- q->blk_trace->pid = value;
+ bt->pid = value;
else if (attr == &dev_attr_start_lba)
- q->blk_trace->start_lba = value;
+ bt->start_lba = value;
else if (attr == &dev_attr_end_lba)
- q->blk_trace->end_lba = value;
+ bt->end_lba = value;
}
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
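
The blktrace conversion establishes one access discipline for q->blk_trace: tracepoint readers use rcu_read_lock()/rcu_dereference(), updaters hold blk_trace_mutex and use rcu_dereference_protected(), and teardown calls synchronize_rcu() before blk_trace_free(). Below is the userspace shape of those rules, with C11 atomics standing in for RCU; this shows the discipline only, not a working grace-period mechanism, and all names are invented:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct blk_trace_stub { int act_mask; };

static _Atomic(struct blk_trace_stub *) blk_trace_ptr;
static pthread_mutex_t blk_trace_mutex = PTHREAD_MUTEX_INITIALIZER;

static int reader_event(void)
{
	/* rcu_read_lock(); bt = rcu_dereference(); ... rcu_read_unlock() */
	struct blk_trace_stub *bt =
		atomic_load_explicit(&blk_trace_ptr, memory_order_acquire);
	return bt ? bt->act_mask : 0;
}

static void writer_teardown(void)
{
	pthread_mutex_lock(&blk_trace_mutex);
	/* rcu_dereference_protected(): we hold the update-side lock. */
	struct blk_trace_stub *bt =
		atomic_load_explicit(&blk_trace_ptr, memory_order_relaxed);
	atomic_store_explicit(&blk_trace_ptr, NULL, memory_order_release);
	pthread_mutex_unlock(&blk_trace_mutex);

	/* synchronize_rcu() would go here: wait out any reader that
	 * might still hold the old pointer before freeing it. */
	free(bt);
}

int main(void)
{
	pthread_mutex_lock(&blk_trace_mutex);
	atomic_store_explicit(&blk_trace_ptr,
			      calloc(1, sizeof(struct blk_trace_stub)),
			      memory_order_release);
	pthread_mutex_unlock(&blk_trace_mutex);

	(void)reader_event();
	writer_teardown();
	return 0;
}
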
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3dd40c736067..a71bdad638d5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -206,6 +206,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
event->pmu->count)
return -EINVAL;
+ if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
+ event->attr.type != PERF_TYPE_RAW))
+ return -EINVAL;
+
/*
* we don't know if the function is run successfully by the
* return value. It can be judged in other places, such as
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6380ec0453e0..e4c6f89b6b11 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -637,8 +637,7 @@ static int function_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- avg = rec->time;
- do_div(avg, rec->counter);
+ avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
goto out;
#endif
@@ -664,7 +663,8 @@ static int function_stat_show(struct seq_file *m, void *v)
* Divide only 1000 for ns^2 -> us^2 conversion.
* trace_print_graph_duration will divide 1000 again.
*/
- do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev,
+ rec->counter * (rec->counter - 1) * 1000);
}
trace_seq_init(&s);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4509cf51fbfd..b0ebb271bae7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5910,12 +5910,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
buf->private = 0;
}
-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+ if (ref->ref > INT_MAX/2)
+ return false;
+
ref->ref++;
+ return true;
}
/* Pipe buffer operations for a buffer. */
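
buffer_pipe_buf_get() now refuses to take a reference once the count passes INT_MAX/2, so a reference leak saturates instead of wrapping the counter into a use-after-free. A minimal sketch of the checked increment:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

struct ref_stub { int ref; };

static bool ref_get(struct ref_stub *r)
{
	if (r->ref > INT_MAX / 2)
		return false;   /* saturate instead of wrapping */
	r->ref++;
	return true;
}

int main(void)
{
	struct ref_stub r = { .ref = INT_MAX / 2 + 1 };
	printf("get: %s\n", ref_get(&r) ? "ok" : "refused");
	return 0;
}
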
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8be66a2b0cac..78346aba6980 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -121,9 +121,10 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
{
struct trace_event_file *event_file = event_file_data(m->private);
- if (t == SHOW_AVAILABLE_TRIGGERS)
+ if (t == SHOW_AVAILABLE_TRIGGERS) {
+ (*pos)++;
return NULL;
-
+ }
return seq_list_next(t, &event_file->triggers, pos);
}
@@ -909,14 +910,10 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- int ret = register_trigger(glob, ops, data, file);
-
- if (ret > 0 && tracing_alloc_snapshot() != 0) {
- unregister_trigger(glob, ops, data, file);
- ret = 0;
- }
+ if (tracing_alloc_snapshot() != 0)
+ return 0;
- return ret;
+ return register_trigger(glob, ops, data, file);
}
static int
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 78f04e4ad829..927fd4ad5846 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -626,7 +626,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
- return;
+ goto fail_deprobe_sched_switch;
}
wakeup_reset(tr);
@@ -644,6 +644,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
+fail_deprobe_sched_switch:
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
fail_deprobe_wake_new:
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 202df6cffcca..f08ec7c6f9e0 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -197,6 +197,11 @@ check_stack(unsigned long ip, unsigned long *stack)
local_irq_restore(flags);
}
+/* Some archs may not define MCOUNT_INSN_SIZE */
+#ifndef MCOUNT_INSN_SIZE
+# define MCOUNT_INSN_SIZE 0
+#endif
+
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 6cf935316769..a2081a7f0c2c 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -302,7 +302,7 @@ static int init_stat_file(struct stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
struct stat_session *session, *node;
- int ret;
+ int ret = -EINVAL;
if (!trace)
return -EINVAL;
@@ -313,17 +313,15 @@ int register_stat_tracer(struct tracer_stat *trace)
/* Already registered? */
mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry(node, &all_stat_sessions, session_list) {
- if (node->ts == trace) {
- mutex_unlock(&all_stat_sessions_mutex);
- return -EINVAL;
- }
+ if (node->ts == trace)
+ goto out;
}
- mutex_unlock(&all_stat_sessions_mutex);
+ ret = -ENOMEM;
/* Init the session */
session = kzalloc(sizeof(*session), GFP_KERNEL);
if (!session)
- return -ENOMEM;
+ goto out;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
@@ -332,15 +330,16 @@ int register_stat_tracer(struct tracer_stat *trace)
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
- return ret;
+ goto out;
}
+ ret = 0;
/* Register */
- mutex_lock(&all_stat_sessions_mutex);
list_add_tail(&session->session_list, &all_stat_sessions);
+ out:
mutex_unlock(&all_stat_sessions_mutex);
- return 0;
+ return ret;
}
void unregister_stat_tracer(struct tracer_stat *trace)
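
The register_stat_tracer() rework closes a window where the duplicate check and the list insertion happened under separate acquisitions of all_stat_sessions_mutex, so two racing registrations of the same tracer could both pass the check. A userspace sketch of the resulting single-critical-section shape, with a goto-out unlock path like the patch's; names and error codes are illustrative:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct session {
	const char *name;
	struct session *next;
};

static struct session *sessions;
static pthread_mutex_t sessions_mutex = PTHREAD_MUTEX_INITIALIZER;

static int register_session(const char *name)
{
	int ret = -1;           /* -EINVAL analogue: duplicate */
	struct session *s;

	pthread_mutex_lock(&sessions_mutex);
	for (s = sessions; s; s = s->next)
		if (strcmp(s->name, name) == 0)
			goto out;

	ret = -2;               /* -ENOMEM analogue */
	s = calloc(1, sizeof(*s));
	if (!s)
		goto out;

	s->name = name;
	s->next = sessions;
	sessions = s;           /* insert while still holding the lock */
	ret = 0;
out:
	pthread_mutex_unlock(&sessions_mutex);
	return ret;
}

int main(void)
{
	int a = register_session("demo");  /* 0: registered */
	int b = register_session("demo");  /* -1: duplicate detected */
	return (a == 0 && b == -1) ? 0 : 1;
}
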
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 89a0f1171f90..696f0913ba38 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2329,8 +2329,14 @@ repeat:
*/
if (need_to_create_worker(pool)) {
spin_lock(&wq_mayday_lock);
- get_pwq(pwq);
- list_move_tail(&pwq->mayday_node, &wq->maydays);
+ /*
+ * Queue iff we aren't racing destruction
+ * and somebody else hasn't queued it already.
+ */
+ if (wq->rescuer && list_empty(&pwq->mayday_node)) {
+ get_pwq(pwq);
+ list_add_tail(&pwq->mayday_node, &wq->maydays);
+ }
spin_unlock(&wq_mayday_lock);
}
}
@@ -2919,6 +2925,31 @@ bool flush_delayed_work(struct delayed_work *dwork)
}
EXPORT_SYMBOL(flush_delayed_work);
+static bool __cancel_work(struct work_struct *work, bool is_dwork)
+{
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(work, is_dwork, &flags);
+ } while (unlikely(ret == -EAGAIN));
+
+ if (unlikely(ret < 0))
+ return false;
+
+ set_work_pool_and_clear_pending(work, get_work_pool_id(work));
+ local_irq_restore(flags);
+ return ret;
+}
+
+/*
+ * See cancel_delayed_work()
+ */
+bool cancel_work(struct work_struct *work)
+{
+ return __cancel_work(work, false);
+}
+
/**
* cancel_delayed_work - cancel a delayed work
* @dwork: delayed_work to cancel
@@ -2937,20 +2968,7 @@ EXPORT_SYMBOL(flush_delayed_work);
*/
bool cancel_delayed_work(struct delayed_work *dwork)
{
- unsigned long flags;
- int ret;
-
- do {
- ret = try_to_grab_pending(&dwork->work, true, &flags);
- } while (unlikely(ret == -EAGAIN));
-
- if (unlikely(ret < 0))
- return false;
-
- set_work_pool_and_clear_pending(&dwork->work,
- get_work_pool_id(&dwork->work));
- local_irq_restore(flags);
- return ret;
+ return __cancel_work(&dwork->work, true);
}
EXPORT_SYMBOL(cancel_delayed_work);
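
cancel_work() and cancel_delayed_work() now share __cancel_work(), whose core is a retry loop around try_to_grab_pending() that spins only on -EAGAIN and maps the final tri-state result onto a boolean. A sketch of that loop shape, with a stand-in try_grab() replacing the real primitive:

#include <stdbool.h>
#include <stdio.h>

#define EAGAIN_STUB (-11)

/* Returns <0 on hard failure, 0 if idle, 1 if a pending item was
 * grabbed; EAGAIN_STUB asks the caller to retry. */
static int try_grab(int *attempts)
{
	return (*attempts)-- > 0 ? EAGAIN_STUB : 1;
}

static bool cancel_common(int *attempts)
{
	int ret;

	do {
		ret = try_grab(attempts);
	} while (ret == EAGAIN_STUB);

	if (ret < 0)
		return false;
	return ret;     /* 0 or 1, as in __cancel_work() */
}

int main(void)
{
	int attempts = 3;
	printf("cancelled: %d\n", cancel_common(&attempts));
	return 0;
}
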
@@ -3971,9 +3989,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
struct pool_workqueue *pwq;
int node;
+ /*
+ * Remove it from sysfs first so that sanity check failure doesn't
+ * lead to sysfs name conflicts.
+ */
+ workqueue_sysfs_unregister(wq);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
+ /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
+ if (wq->rescuer) {
+ struct worker *rescuer = wq->rescuer;
+
+ /* this prevents new queueing */
+ spin_lock_irq(&wq_mayday_lock);
+ wq->rescuer = NULL;
+ spin_unlock_irq(&wq_mayday_lock);
+
+ /* rescuer will empty maydays list before exiting */
+ kthread_stop(rescuer->task);
+ kfree(rescuer);
+ }
+
/* sanity checks */
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq) {
@@ -4003,11 +4041,6 @@ void destroy_workqueue(struct workqueue_struct *wq)
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
- workqueue_sysfs_unregister(wq);
-
- if (wq->rescuer)
- kthread_stop(wq->rescuer->task);
-
if (!(wq->flags & WQ_UNBOUND)) {
/*
* The base ref is never dropped on per-cpu pwqs. Directly
@@ -4284,7 +4317,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ pr_cont(" active=%d/%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->max_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {