summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit_watch.c8
-rw-r--r--kernel/bpf/helpers.c2
-rw-r--r--kernel/bpf/inode.c10
-rw-r--r--kernel/bpf/syscall.c24
-rw-r--r--kernel/bpf/verifier.c78
-rw-r--r--kernel/capability.c46
-rw-r--r--kernel/cgroup.c58
-rw-r--r--kernel/configs/tiny.config8
-rw-r--r--kernel/cpuset.c105
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/events/core.c97
-rw-r--r--kernel/events/ring_buffer.c10
-rw-r--r--kernel/events/uprobes.c5
-rw-r--r--kernel/exit.c29
-rw-r--r--kernel/fork.c33
-rw-r--r--kernel/futex.c32
-rw-r--r--kernel/irq/generic-chip.c21
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/msi.c20
-rw-r--r--kernel/jump_label.c36
-rw-r--r--kernel/kexec_file.c3
-rw-r--r--kernel/locking/mcs_spinlock.h8
-rw-r--r--kernel/locking/mutex.c9
-rw-r--r--kernel/locking/qspinlock.c60
-rw-r--r--kernel/memremap.c6
-rw-r--r--kernel/module.c133
-rw-r--r--kernel/panic.c3
-rw-r--r--kernel/power/hibernate.c5
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/printk/braille.c4
-rw-r--r--kernel/printk/printk.c35
-rw-r--r--kernel/resource.c5
-rw-r--r--kernel/sched/core.c71
-rw-r--r--kernel/sched/cputime.c29
-rw-r--r--kernel/sched/fair.c78
-rw-r--r--kernel/sched/loadavg.c11
-rw-r--r--kernel/sched/sched.h13
-rw-r--r--kernel/seccomp.c22
-rw-r--r--kernel/sys.c20
-rw-r--r--kernel/sysctl.c59
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/time/clocksource.c52
-rw-r--r--kernel/time/hrtimer.c62
-rw-r--r--kernel/time/itimer.c2
-rw-r--r--kernel/time/ntp.c20
-rw-r--r--kernel/time/posix-clock.c4
-rw-r--r--kernel/time/posix-cpu-timers.c1
-rw-r--r--kernel/time/posix-timers.c2
-rw-r--r--kernel/time/tick-sched.c4
-rw-r--r--kernel/time/timekeeping.c34
-rw-r--r--kernel/time/timekeeping_debug.c9
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ring_buffer.c35
-rw-r--r--kernel/trace/trace.c36
-rw-r--r--kernel/trace/trace_events.c26
-rw-r--r--kernel/trace/trace_events_filter.c13
-rw-r--r--kernel/trace/trace_irqsoff.c8
-rw-r--r--kernel/trace/trace_printk.c10
-rw-r--r--kernel/trace/trace_stack.c7
-rw-r--r--kernel/watchdog.c9
-rw-r--r--kernel/workqueue.c58
62 files changed, 1210 insertions, 403 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 656c7e93ac0d..939945a5649c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -19,6 +19,7 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
unsigned long ino;
dev_t dev;
- rcu_read_lock();
- exe_file = rcu_dereference(tsk->mm->exe_file);
+ exe_file = get_task_exe_file(tsk);
+ if (!exe_file)
+ return 0;
ino = exe_file->f_inode->i_ino;
dev = exe_file->f_inode->i_sb->s_dev;
- rcu_read_unlock();
+ fput(exe_file);
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4504ca66118d..50da680c479f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -166,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
if (!task)
return -EINVAL;
- memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+ strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
return 0;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8a797d50b7..cb85d228b1ac 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
{
switch (type) {
case BPF_TYPE_PROG:
- atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+ raw = bpf_prog_inc(raw);
break;
case BPF_TYPE_MAP:
- bpf_map_inc(raw, true);
+ raw = bpf_map_inc(raw, true);
break;
default:
WARN_ON_ONCE(1);
@@ -277,7 +277,8 @@ static void *bpf_obj_do_get(const struct filename *pathname,
goto out;
raw = bpf_any_get(inode->i_private, *type);
- touch_atime(&path);
+ if (!IS_ERR(raw))
+ touch_atime(&path);
path_put(&path);
return raw;
@@ -357,7 +358,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
static struct dentry *bpf_mount(struct file_system_type *type, int flags,
const char *dev_name, void *data)
{
- return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+ return mount_nodev(type, flags, data, bpf_fill_super);
}
static struct file_system_type bpf_fs_type = {
@@ -365,7 +366,6 @@ static struct file_system_type bpf_fs_type = {
.name = "bpf",
.mount = bpf_mount,
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
};
MODULE_ALIAS_FS("bpf");
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3b39550d8485..4e32cc94edd9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -181,11 +181,18 @@ struct bpf_map *__bpf_map_get(struct fd f)
return f.file->private_data;
}
-void bpf_map_inc(struct bpf_map *map, bool uref)
+/* prog's and map's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
{
- atomic_inc(&map->refcnt);
+ if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&map->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
if (uref)
atomic_inc(&map->usercnt);
+ return map;
}
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
@@ -197,7 +204,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
if (IS_ERR(map))
return map;
- bpf_map_inc(map, true);
+ map = bpf_map_inc(map, true);
fdput(f);
return map;
@@ -580,6 +587,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&prog->aux->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
+ return prog;
+}
+
/* called by sockets/tracing/seccomp before attaching program to an event
* pairs with bpf_prog_put()
*/
@@ -592,7 +608,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
if (IS_ERR(prog))
return prog;
- atomic_inc(&prog->aux->refcnt);
+ prog = bpf_prog_inc(prog);
fdput(f);
return prog;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a7945d10b378..2cbfba78d3db 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -239,15 +239,6 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = "imm",
};
-static const struct {
- int map_type;
- int func_id;
-} func_limit[] = {
- {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
-};
-
static void print_verifier_state(struct verifier_env *env)
{
enum bpf_reg_type t;
@@ -898,24 +889,44 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
static int check_map_func_compatibility(struct bpf_map *map, int func_id)
{
- bool bool_map, bool_func;
- int i;
-
if (!map)
return 0;
- for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
- bool_map = (map->map_type == func_limit[i].map_type);
- bool_func = (func_id == func_limit[i].func_id);
- /* only when map & func pair match it can continue.
- * don't allow any other map type to be passed into
- * the special func;
- */
- if (bool_func && bool_map != bool_func)
- return -EINVAL;
+ /* We need a two way check, first is from map perspective ... */
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ if (func_id != BPF_FUNC_tail_call)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ if (func_id != BPF_FUNC_perf_event_read &&
+ func_id != BPF_FUNC_perf_event_output)
+ goto error;
+ break;
+ default:
+ break;
+ }
+
+ /* ... and second from the function itself. */
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_perf_event_read:
+ case BPF_FUNC_perf_event_output:
+ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ goto error;
+ break;
+ default:
+ break;
}
return 0;
+error:
+ verbose("cannot pass map_type %d into func %d\n",
+ map->map_type, func_id);
+ return -EINVAL;
}
static int check_call(struct verifier_env *env, int func_id)
@@ -1121,6 +1132,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if ((opcode == BPF_LSH || opcode == BPF_RSH ||
+ opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
+ int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
+
+ if (insn->imm < 0 || insn->imm >= size) {
+ verbose("invalid shift %d\n", insn->imm);
+ return -EINVAL;
+ }
+ }
+
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
regs[insn->dst_reg].type == FRAME_PTR &&
@@ -1338,6 +1359,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
verbose("BPF_LD_ABS uses reserved fields\n");
return -EINVAL;
@@ -1993,7 +2015,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
if (IS_ERR(map)) {
verbose("fd %d is not pointing to valid bpf_map\n",
insn->imm);
- fdput(f);
return PTR_ERR(map);
}
@@ -2013,15 +2034,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return -E2BIG;
}
- /* remember this map */
- env->used_maps[env->used_map_cnt++] = map;
-
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
* and all maps are released in free_bpf_prog_info()
*/
- bpf_map_inc(map, false);
+ map = bpf_map_inc(map, false);
+ if (IS_ERR(map)) {
+ fdput(f);
+ return PTR_ERR(map);
+ }
+ env->used_maps[env->used_map_cnt++] = map;
+
fdput(f);
next_insn:
insn++;
@@ -2072,7 +2096,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
/* adjust offset of jmps if necessary */
if (i < pos && i + insn->off + 1 > pos)
insn->off += delta;
- else if (i > pos && i + insn->off + 1 < pos)
+ else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
insn->off -= delta;
}
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b54d5c6..00411c82dac5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+{
+ int capable;
+
+ if (unlikely(!cap_valid(cap))) {
+ pr_crit("capable() called with invalid cap=%u\n", cap);
+ BUG();
+ }
+
+ capable = audit ? security_capable(current_cred(), ns, cap) :
+ security_capable_noaudit(current_cred(), ns, cap);
+ if (capable == 0) {
+ current->flags |= PF_SUPERPRIV;
+ return true;
+ }
+ return false;
+}
+
/**
* ns_capable - Determine if the current task has a superior capability in effect
* @ns: The usernamespace we want the capability in
@@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- if (unlikely(!cap_valid(cap))) {
- pr_crit("capable() called with invalid cap=%u\n", cap);
- BUG();
- }
-
- if (security_capable(current_cred(), ns, cap) == 0) {
- current->flags |= PF_SUPERPRIV;
- return true;
- }
- return false;
+ return ns_capable_common(ns, cap, true);
}
EXPORT_SYMBOL(ns_capable);
+/**
+ * ns_capable_noaudit - Determine if the current task has a superior capability
+ * (unaudited) in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_noaudit(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, false);
+}
+EXPORT_SYMBOL(ns_capable_noaudit);
/**
* capable - Determine if the current task has a superior capability in effect
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 00af24ac0167..e4552a3cbf41 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,7 +57,7 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
-
+#include <linux/cpuset.h>
#include <linux/atomic.h>
/*
@@ -2498,6 +2498,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
+ /*
+ * If ->dead, @src_set is associated with one or more dead cgroups
+ * and doesn't contain any migratable tasks. Ignore it early so
+ * that the rest of migration path doesn't get confused by it.
+ */
+ if (src_cset->dead)
+ return;
+
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
if (!list_empty(&src_cset->mg_preload_node))
@@ -2714,9 +2722,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
+ struct cgroup_subsys *ss;
struct cgroup *cgrp;
pid_t pid;
- int ret;
+ int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL;
@@ -2764,6 +2773,9 @@ out_unlock_rcu:
rcu_read_unlock();
out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ for_each_subsys(ss, ssid)
+ if (ss->post_attach)
+ ss->post_attach();
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
@@ -4681,14 +4693,15 @@ static void css_free_work_fn(struct work_struct *work)
if (ss) {
/* css free path */
+ struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
- if (css->parent)
- css_put(css->parent);
-
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
+
+ if (parent)
+ css_put(parent);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
@@ -4781,9 +4794,11 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
css->ss = ss;
+ css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
css->serial_nr = css_serial_nr_next++;
+ atomic_set(&css->online_cnt, 0);
if (cgroup_parent(cgrp)) {
css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4806,6 +4821,10 @@ static int online_css(struct cgroup_subsys_state *css)
if (!ret) {
css->flags |= CSS_ONLINE;
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+ atomic_inc(&css->online_cnt);
+ if (css->parent)
+ atomic_inc(&css->parent->online_cnt);
}
return ret;
}
@@ -5037,10 +5056,15 @@ static void css_killed_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
mutex_lock(&cgroup_mutex);
- offline_css(css);
- mutex_unlock(&cgroup_mutex);
- css_put(css);
+ do {
+ offline_css(css);
+ css_put(css);
+ /* @css can't go away while we're holding cgroup_mutex */
+ css = css->parent;
+ } while (css && atomic_dec_and_test(&css->online_cnt));
+
+ mutex_unlock(&cgroup_mutex);
}
/* css kill confirmation processing requires process context, bounce */
@@ -5049,8 +5073,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ if (atomic_dec_and_test(&css->online_cnt)) {
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+ }
}
/**
@@ -5119,6 +5145,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup_subsys_state *css;
+ struct cgrp_cset_link *link;
int ssid;
lockdep_assert_held(&cgroup_mutex);
@@ -5139,11 +5166,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return -EBUSY;
/*
- * Mark @cgrp dead. This prevents further task migration and child
- * creation by disabling cgroup_lock_live_group().
+ * Mark @cgrp and the associated csets dead. The former prevents
+ * further task migration and child creation by disabling
+ * cgroup_lock_live_group(). The latter makes the csets ignored by
+ * the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
+ spin_lock_bh(&css_set_lock);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ link->cset->dead = true;
+ spin_unlock_bh(&css_set_lock);
+
/* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
kill_css(css);
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2de56ab0fce..7fa0c4ae6394 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -1,4 +1,12 @@
+# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_KERNEL_GZIP is not set
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
CONFIG_KERNEL_XZ=y
+# CONFIG_KERNEL_LZO is not set
+# CONFIG_KERNEL_LZ4 is not set
CONFIG_OPTIMIZE_INLINING=y
+# CONFIG_SLAB is not set
+# CONFIG_SLUB is not set
CONFIG_SLOB=y
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3d0f77112eb3..3f9db31c5d04 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -57,7 +57,6 @@
#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
@@ -287,6 +286,8 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
/*
* CPU / memory hotplug is handled asynchronously.
*/
@@ -324,8 +325,7 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
@@ -334,8 +334,20 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
cs = parent_cs(cs);
+ if (unlikely(!cs)) {
+ /*
+ * The top cpuset doesn't have any online cpu as a
+ * consequence of a race between cpuset_hotplug_work
+ * and cpu hotplug notifier. But we know the top
+ * cpuset's effective_cpus is on its way to to be
+ * identical to cpu_online_mask.
+ */
+ cpumask_copy(pmask, cpu_online_mask);
+ return;
+ }
+ }
cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
@@ -974,31 +986,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
/*
- * cpuset_migrate_mm
- *
- * Migrate memory region from one set of nodes to another.
- *
- * Temporarilly set tasks mems_allowed to target nodes of migration,
- * so that the migration code can allocate pages on these nodes.
- *
- * While the mm_struct we are migrating is typically from some
- * other task, the task_struct mems_allowed that we are hacking
- * is for our current task, which must allocate new pages for that
- * migrating memory region.
+ * Migrate memory region from one set of nodes to another. This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management. All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
*/
+struct cpuset_migrate_mm_work {
+ struct work_struct work;
+ struct mm_struct *mm;
+ nodemask_t from;
+ nodemask_t to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+ struct cpuset_migrate_mm_work *mwork =
+ container_of(work, struct cpuset_migrate_mm_work, work);
+
+ /* on a wq worker, no need to worry about %current's mems_allowed */
+ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+ mmput(mwork->mm);
+ kfree(mwork);
+}
+
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
- struct task_struct *tsk = current;
-
- tsk->mems_allowed = *to;
+ struct cpuset_migrate_mm_work *mwork;
- do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+ mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+ if (mwork) {
+ mwork->mm = mm;
+ mwork->from = *from;
+ mwork->to = *to;
+ INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+ queue_work(cpuset_migrate_mm_wq, &mwork->work);
+ } else {
+ mmput(mm);
+ }
+}
- rcu_read_lock();
- guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
- rcu_read_unlock();
+static void cpuset_post_attach(void)
+{
+ flush_workqueue(cpuset_migrate_mm_wq);
}
/*
@@ -1099,7 +1131,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
- mmput(mm);
+ else
+ mmput(mm);
}
css_task_iter_end(&it);
@@ -1544,11 +1577,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs)) {
+ if (is_memory_migrate(cs))
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- }
- mmput(mm);
+ else
+ mmput(mm);
}
}
@@ -1713,6 +1746,7 @@ out_unlock:
mutex_unlock(&cpuset_mutex);
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
+ flush_workqueue(cpuset_migrate_mm_wq);
return retval ?: nbytes;
}
@@ -2061,6 +2095,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&cpuset_mutex);
}
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+void cpuset_fork(struct task_struct *task, void *priv)
+{
+ if (task_css_is_root(task, cpuset_cgrp_id))
+ return;
+
+ set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ task->mems_allowed = current->mems_allowed;
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
@@ -2069,7 +2117,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
+ .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .fork = cpuset_fork,
.legacy_cftypes = files,
.early_init = 1,
};
@@ -2368,6 +2418,9 @@ void __init cpuset_init_smp(void)
top_cpuset.effective_mems = node_states[N_MEMORY];
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+ cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+ BUG_ON(!cpuset_migrate_mm_wq);
}
/**
diff --git a/kernel/cred.c b/kernel/cred.c
index 71179a09c1d6..ff8606f77d90 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EINVAL;
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2f07e9c34f43..541ea712ad98 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -951,6 +951,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
+ * cred_guard_mutex
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event_context::lock
@@ -1543,12 +1544,33 @@ static int __init perf_workqueue_init(void)
core_initcall(perf_workqueue_init);
-static inline int pmu_filter_match(struct perf_event *event)
+static inline int __pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
return pmu->filter_match ? pmu->filter_match(event) : 1;
}
+/*
+ * Check whether we should attempt to schedule an event group based on
+ * PMU-specific filtering. An event group can consist of HW and SW events,
+ * potentially with a SW leader, so we must check all the filters, to
+ * determine whether a group is schedulable:
+ */
+static inline int pmu_filter_match(struct perf_event *event)
+{
+ struct perf_event *child;
+
+ if (!__pmu_filter_match(event))
+ return 0;
+
+ list_for_each_entry(child, &event->sibling_list, group_entry) {
+ if (!__pmu_filter_match(child))
+ return 0;
+ }
+
+ return 1;
+}
+
static inline int
event_filter_match(struct perf_event *event)
{
@@ -1585,14 +1607,14 @@ event_sched_out(struct perf_event *event,
perf_pmu_disable(event->pmu);
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
- event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -3423,7 +3445,6 @@ static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
- int err;
rcu_read_lock();
if (!vpid)
@@ -3437,16 +3458,7 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /* Reuse ptrace permission checks for now. */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto errout;
-
return task;
-errout:
- put_task_struct(task);
- return ERR_PTR(err);
-
}
/*
@@ -7115,7 +7127,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
prog = event->tp_event->prog;
if (prog) {
event->tp_event->prog = NULL;
- bpf_prog_put(prog);
+ bpf_prog_put_rcu(prog);
}
}
@@ -7984,6 +7996,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
+ /* symmetric to unaccount_event() in _free_event() */
+ account_event(event);
+
return event;
err_per_task:
@@ -8333,6 +8348,24 @@ SYSCALL_DEFINE5(perf_event_open,
get_online_cpus();
+ if (task) {
+ err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+ if (err)
+ goto err_cpus;
+
+ /*
+ * Reuse ptrace permission checks for now.
+ *
+ * We must hold cred_guard_mutex across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
+ }
+
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -8340,7 +8373,7 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cpus;
+ goto err_cred;
}
if (is_sampling_event(event)) {
@@ -8350,8 +8383,6 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- account_event(event);
-
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -8401,11 +8432,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- if (task) {
- put_task_struct(task);
- task = NULL;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -8493,6 +8519,11 @@ SYSCALL_DEFINE5(perf_event_open,
WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * This is the point on no return; we cannot fail hereafter. This is
+ * where we start modifying current state.
+ */
+
if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
@@ -8562,6 +8593,11 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&gctx->mutex);
mutex_unlock(&ctx->mutex);
+ if (task) {
+ mutex_unlock(&task->signal->cred_guard_mutex);
+ put_task_struct(task);
+ }
+
put_online_cpus();
event->owner = current;
@@ -8590,7 +8626,15 @@ err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ /*
+ * If event_file is set, the fput() above will have called ->release()
+ * and that will take care of freeing the event.
+ */
+ if (!event_file)
+ free_event(event);
+err_cred:
+ if (task)
+ mutex_unlock(&task->signal->cred_guard_mutex);
err_cpus:
put_online_cpus();
err_task:
@@ -8634,8 +8678,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = EVENT_OWNER_KERNEL;
- account_event(event);
-
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
@@ -8872,6 +8914,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
*/
void perf_event_exit_task(struct task_struct *child)
{
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index adfdc0536117..014b69528194 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -347,6 +347,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
bool truncated)
{
struct ring_buffer *rb = handle->rb;
+ bool wakeup = truncated;
unsigned long aux_head;
u64 flags = 0;
@@ -375,9 +376,16 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
- perf_output_wakeup(handle);
+ wakeup = true;
local_add(rb->aux_watermark, &rb->aux_wakeup);
}
+
+ if (wakeup) {
+ if (truncated)
+ handle->event->pending_disable = 1;
+ perf_output_wakeup(handle);
+ }
+
handle->event = NULL;
local_set(&rb->aux_nest, 0);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7dad84913abf..da0c09ff6112 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -171,8 +171,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
+ if (!ptep) {
+ mem_cgroup_cancel_charge(kpage, memcg);
goto unlock;
+ }
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr);
@@ -199,7 +201,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 92ff63200287..62c4bd4abd3a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -923,17 +923,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
task_pid_type(p, wo->wo_type) == wo->wo_pid;
}
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return 0;
- /* Wait for all children (clone and not) if __WALL is set;
- * otherwise, wait for clone children *only* if __WCLONE is
- * set; otherwise, wait for non-clone children *only*. (Note:
- * A "clone" child here is one that reports to its parent
- * using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
- && !(wo->wo_flags & __WALL))
+
+ /*
+ * Wait for all children (clone and not) if __WALL is set or
+ * if it is traced by us.
+ */
+ if (ptrace || (wo->wo_flags & __WALL))
+ return 1;
+
+ /*
+ * Otherwise, wait for clone children *only* if __WCLONE is set;
+ * otherwise, wait for non-clone children *only*.
+ *
+ * Note: a "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD, or a non-leader thread which
+ * we can only see if it is traced by us.
+ */
+ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
return 0;
return 1;
@@ -1306,7 +1317,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
if (unlikely(exit_state == EXIT_DEAD))
return 0;
- ret = eligible_child(wo, p);
+ ret = eligible_child(wo, ptrace, p);
if (!ret)
return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index d6a6da547e41..18a5cb17035a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -764,6 +764,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
EXPORT_SYMBOL(get_mm_exe_file);
/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+ struct file *exe_file = NULL;
+ struct mm_struct *mm;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm) {
+ if (!(task->flags & PF_KTHREAD))
+ exe_file = get_mm_exe_file(mm);
+ }
+ task_unlock(task);
+ return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
+
+/**
* get_task_mm - acquire a reference to the task's mm
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
@@ -879,14 +902,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
deactivate_mm(tsk, mm);
/*
- * If we're exiting normally, clear a user-space tid field if
- * requested. We leave this alone when dying by signal, to leave
- * the value intact in a core dump, and to save the unnecessary
- * trouble, say, a killed vfork parent shouldn't touch this mm.
- * Userland only wants this done for a sys_exit.
+ * Signal userspace if we're not exiting with a core dump
+ * because we want to leave the value intact for debugging
+ * purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->flags & PF_SIGNALED) &&
+ if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
diff --git a/kernel/futex.c b/kernel/futex.c
index 495a1d06915b..9d8163afd87c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1244,10 +1244,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (unlikely(should_fail_futex(true)))
ret = -EFAULT;
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
+ } else if (curval != uval) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+ * FUTEX_WAITERS flag between get_user() and locking the hash
+ * bucket lock, retry the operation.
+ */
+ if ((FUTEX_TID_MASK & curval) == uval)
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
if (ret) {
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
return ret;
@@ -1474,8 +1484,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
hb_waiters_dec(hb1);
- plist_add(&q->list, &hb2->chain);
hb_waiters_inc(hb2);
+ plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -2538,6 +2548,15 @@ retry:
if (ret == -EFAULT)
goto pi_faulted;
/*
+ * A unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN) {
+ spin_unlock(&hb->lock);
+ put_futex_key(&key);
+ goto retry;
+ }
+ /*
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
@@ -2755,6 +2774,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
+ /*
+ * Drop the reference to the pi state which
+ * the requeue_pi() code acquired for us.
+ */
+ free_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
}
} else {
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index abd286afbd27..a4775f3451b9 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -411,8 +411,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
}
EXPORT_SYMBOL_GPL(irq_map_generic_chip);
+static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+{
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int hw_irq = data->hwirq;
+ struct irq_chip_generic *gc;
+ int irq_idx;
+
+ gc = irq_get_domain_generic_chip(d, hw_irq);
+ if (!gc)
+ return;
+
+ irq_idx = hw_irq % dgc->irqs_per_chip;
+
+ clear_bit(irq_idx, &gc->installed);
+ irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL,
+ NULL);
+
+}
+
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
+ .unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a302cf9a2126..57bff7857e87 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -138,7 +138,8 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
unsigned int flags = 0, irq = desc->irq_data.irq;
struct irqaction *action = desc->action;
- do {
+ /* action might have become NULL since we dropped the lock */
+ while (action) {
irqreturn_t res;
trace_irq_handler_entry(irq, action);
@@ -173,7 +174,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
retval |= res;
action = action->next;
- } while (action);
+ }
add_interrupt_randomness(irq, flags);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 6b0c0b74a2a1..cd6009006510 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_domain_ops *ops = info->ops;
msi_alloc_info_t arg;
struct msi_desc *desc;
- int i, ret, virq = -1;
+ int i, ret, virq;
ret = ops->msi_check(domain, info, dev);
if (ret == 0)
@@ -278,12 +278,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
for_each_msi_entry(desc, dev) {
ops->set_desc(&arg, desc);
- if (info->flags & MSI_FLAG_IDENTITY_MAP)
- virq = (int)ops->get_hwirq(info, &arg);
- else
- virq = -1;
- virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+ virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false);
if (virq < 0) {
ret = -ENOSPC;
@@ -302,11 +298,23 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
ops->msi_finish(&arg, 0);
for_each_msi_entry(desc, dev) {
+ virq = desc->irq;
if (desc->nvec_used == 1)
dev_dbg(dev, "irq %d for MSI\n", virq);
else
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+ struct irq_data *irq_data;
+
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ irq_domain_activate_irq(irq_data);
+ }
}
return 0;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 05254eeb4b4e..4b353e0be121 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);
void static_key_slow_inc(struct static_key *key)
{
+ int v, v1;
+
STATIC_KEY_CHECK_USE();
- if (atomic_inc_not_zero(&key->enabled))
- return;
+
+ /*
+ * Careful if we get concurrent static_key_slow_inc() calls;
+ * later calls must wait for the first one to _finish_ the
+ * jump_label_update() process. At the same time, however,
+ * the jump_label_update() call below wants to see
+ * static_key_enabled(&key) for jumps to be updated properly.
+ *
+ * So give a special meaning to negative key->enabled: it sends
+ * static_key_slow_inc() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update(). Note that
+ * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */
+ for (v = atomic_read(&key->enabled); v > 0; v = v1) {
+ v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
+ if (likely(v1 == v))
+ return;
+ }
jump_label_lock();
- if (atomic_inc_return(&key->enabled) == 1)
+ if (atomic_read(&key->enabled) == 0) {
+ atomic_set(&key->enabled, -1);
jump_label_update(key);
+ atomic_set(&key->enabled, 1);
+ } else {
+ atomic_inc(&key->enabled);
+ }
jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
static void __static_key_slow_dec(struct static_key *key,
unsigned long rate_limit, struct delayed_work *work)
{
+ /*
+ * The negative count check is valid even when a negative
+ * key->enabled is in use by static_key_slow_inc(); a
+ * __static_key_slow_dec() before the first static_key_slow_inc()
+ * returns is unbalanced, because all other static_key_slow_inc()
+ * instances block while the update is in progress.
+ */
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b70ada0028d2..6030efd4a188 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -934,7 +934,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
return 0;
out:
vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
return ret;
}
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 5b9102a47ea5..c835270f0c2f 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;
- prev = xchg_acquire(lock, node);
+ /*
+ * We rely on the full barrier with global transitivity implied by the
+ * below xchg() to order the initialization stores above against any
+ * observation of @node. And to provide the ACQUIRE ordering associated
+ * with a LOCK primitive.
+ */
+ prev = xchg(lock, node);
if (likely(prev == NULL)) {
/*
* Lock acquired, don't need to set node->locked to 1. Threads
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 0551c219c40e..89350f924c85 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
if (!hold_ctx)
return 0;
- if (unlikely(ctx == hold_ctx))
- return -EALREADY;
-
if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
(ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
#ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
unsigned long flags;
int ret;
+ if (use_ww_ctx) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+ return -EALREADY;
+ }
+
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..8173bc7fec92 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -255,6 +255,66 @@ static __always_inline void __pv_wait_head(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
+/*
+ * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
+ * issuing an _unordered_ store to set _Q_LOCKED_VAL.
+ *
+ * This means that the store can be delayed, but no later than the
+ * store-release from the unlock. This means that simply observing
+ * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
+ *
+ * There are two paths that can issue the unordered store:
+ *
+ * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
+ *
+ * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
+ * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
+ *
+ * However, in both cases we have other !0 state we've set before to queue
+ * ourseves:
+ *
+ * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
+ * load is constrained by that ACQUIRE to not pass before that, and thus must
+ * observe the store.
+ *
+ * For (2) we have a more intersting scenario. We enqueue ourselves using
+ * xchg_tail(), which ends up being a RELEASE. This in itself is not
+ * sufficient, however that is followed by an smp_cond_acquire() on the same
+ * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
+ * guarantees we must observe that store.
+ *
+ * Therefore both cases have other !0 state that is observable before the
+ * unordered locked byte store comes through. This means we can use that to
+ * wait for the lock store, and then wait for an unlock.
+ */
+#ifndef queued_spin_unlock_wait
+void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+ u32 val;
+
+ for (;;) {
+ val = atomic_read(&lock->val);
+
+ if (!val) /* not locked, we're done */
+ goto done;
+
+ if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
+ break;
+
+ /* not locked, but pending, wait until we observe the lock */
+ cpu_relax();
+ }
+
+ /* any unlock is good */
+ while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
+ cpu_relax();
+
+done:
+ smp_rmb(); /* CTRL + RMB -> ACQUIRE */
+}
+EXPORT_SYMBOL(queued_spin_unlock_wait);
+#endif
+
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 7658d32c5c78..25ced161ebeb 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -111,7 +111,7 @@ EXPORT_SYMBOL(memunmap);
static void devm_memremap_release(struct device *dev, void *res)
{
- memunmap(res);
+ memunmap(*(void **)res);
}
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
@@ -133,8 +133,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
if (addr) {
*ptr = addr;
devres_add(dev, ptr);
- } else
+ } else {
devres_free(ptr);
+ return ERR_PTR(-ENXIO);
+ }
return addr;
}
diff --git a/kernel/module.c b/kernel/module.c
index 38c7bd5583ff..b14a4f31221f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -327,6 +327,9 @@ struct load_info {
struct _ddebug *debug;
unsigned int num_debug;
bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
struct {
unsigned int sym, str, mod, vers, info, pcpu;
} index;
@@ -2492,10 +2495,21 @@ static void layout_symtab(struct module *mod, struct load_info *info)
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
info->index.str) | INIT_OFFSET_MASK;
- mod->init_size = debug_align(mod->init_size);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod->init_size = ALIGN(mod->init_size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod->init_size;
+ mod->init_size += sizeof(struct mod_kallsyms);
+ mod->init_size = debug_align(mod->init_size);
}
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
static void add_kallsyms(struct module *mod, const struct load_info *info)
{
unsigned int i, ndst;
@@ -2504,28 +2518,33 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
char *s;
Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
- mod->symtab = (void *)symsec->sh_addr;
- mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Set up to point into init section. */
+ mod->kallsyms = mod->module_init + info->mod_kallsyms_init_off;
+
+ mod->kallsyms->symtab = (void *)symsec->sh_addr;
+ mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
/* Make sure we get permanent strtab: don't use info->strtab. */
- mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
/* Set types up while we still have access to sections. */
- for (i = 0; i < mod->num_symtab; i++)
- mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
-
- mod->core_symtab = dst = mod->module_core + info->symoffs;
- mod->core_strtab = s = mod->module_core + info->stroffs;
- src = mod->symtab;
- for (ndst = i = 0; i < mod->num_symtab; i++) {
+ for (i = 0; i < mod->kallsyms->num_symtab; i++)
+ mod->kallsyms->symtab[i].st_info
+ = elf_type(&mod->kallsyms->symtab[i], info);
+
+ /* Now populate the cut down core kallsyms for after init. */
+ mod->core_kallsyms.symtab = dst = mod->module_core + info->symoffs;
+ mod->core_kallsyms.strtab = s = mod->module_core + info->stroffs;
+ src = mod->kallsyms->symtab;
+ for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
if (i == 0 ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_strtab;
- s += strlcpy(s, &mod->strtab[src[i].st_name],
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
KSYM_NAME_LEN) + 1;
}
}
- mod->core_num_syms = ndst;
+ mod->core_kallsyms.num_symtab = ndst;
}
#else
static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -2587,13 +2606,18 @@ static inline void kmemleak_load_module(const struct module *mod,
#endif
#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
int err = -ENOKEY;
const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
const void *mod = info->hdr;
- if (info->len > markerlen &&
+ /*
+ * Require flags == 0, as a module with version information
+ * removed is no longer the module that was signed
+ */
+ if (flags == 0 &&
+ info->len > markerlen &&
memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
/* We truncate the module to discard the signature */
info->len -= markerlen;
@@ -2612,7 +2636,7 @@ static int module_sig_check(struct load_info *info)
return err;
}
#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
return 0;
}
@@ -3274,9 +3298,8 @@ static noinline int do_init_module(struct module *mod)
module_put(mod);
trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
- mod->num_symtab = mod->core_num_syms;
- mod->symtab = mod->core_symtab;
- mod->strtab = mod->core_strtab;
+ /* Switch to core kallsyms now init is done: kallsyms may be walking! */
+ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
mod_tree_remove_init(mod);
unset_module_init_ro_nx(mod);
@@ -3426,7 +3449,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
long err;
char *after_dashes;
- err = module_sig_check(info);
+ err = module_sig_check(info, flags);
if (err)
goto free_copy;
@@ -3515,7 +3538,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, NULL,
+ -32768, 32767, mod,
unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
@@ -3646,6 +3669,11 @@ static inline int is_arm_mapping_symbol(const char *str)
&& (str[2] == '\0' || str[2] == '.');
}
+static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
static const char *get_ksymbol(struct module *mod,
unsigned long addr,
unsigned long *size,
@@ -3653,6 +3681,7 @@ static const char *get_ksymbol(struct module *mod,
{
unsigned int i, best = 0;
unsigned long nextval;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
/* At worse, next value is at end of module */
if (within_module_init(addr, mod))
@@ -3662,32 +3691,32 @@ static const char *get_ksymbol(struct module *mod,
/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
- for (i = 1; i < mod->num_symtab; i++) {
- if (mod->symtab[i].st_shndx == SHN_UNDEF)
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
continue;
/* We ignore unnamed symbols: they're uninformative
* and inserted at a whim. */
- if (mod->symtab[i].st_value <= addr
- && mod->symtab[i].st_value > mod->symtab[best].st_value
- && *(mod->strtab + mod->symtab[i].st_name) != '\0'
- && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ if (*symname(kallsyms, i) == '\0'
+ || is_arm_mapping_symbol(symname(kallsyms, i)))
+ continue;
+
+ if (kallsyms->symtab[i].st_value <= addr
+ && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
best = i;
- if (mod->symtab[i].st_value > addr
- && mod->symtab[i].st_value < nextval
- && *(mod->strtab + mod->symtab[i].st_name) != '\0'
- && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
- nextval = mod->symtab[i].st_value;
+ if (kallsyms->symtab[i].st_value > addr
+ && kallsyms->symtab[i].st_value < nextval)
+ nextval = kallsyms->symtab[i].st_value;
}
if (!best)
return NULL;
if (size)
- *size = nextval - mod->symtab[best].st_value;
+ *size = nextval - kallsyms->symtab[best].st_value;
if (offset)
- *offset = addr - mod->symtab[best].st_value;
- return mod->strtab + mod->symtab[best].st_name;
+ *offset = addr - kallsyms->symtab[best].st_value;
+ return symname(kallsyms, best);
}
/* For kallsyms to ask for address resolution. NULL means not found. Careful
@@ -3777,19 +3806,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (symnum < mod->num_symtab) {
- *value = mod->symtab[symnum].st_value;
- *type = mod->symtab[symnum].st_info;
- strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
- KSYM_NAME_LEN);
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ *value = kallsyms->symtab[symnum].st_value;
+ *type = kallsyms->symtab[symnum].st_info;
+ strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
strlcpy(module_name, mod->name, MODULE_NAME_LEN);
*exported = is_exported(name, *value, mod);
preempt_enable();
return 0;
}
- symnum -= mod->num_symtab;
+ symnum -= kallsyms->num_symtab;
}
preempt_enable();
return -ERANGE;
@@ -3798,11 +3829,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
static unsigned long mod_find_symname(struct module *mod, const char *name)
{
unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
- for (i = 0; i < mod->num_symtab; i++)
- if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
- mod->symtab[i].st_info != 'U')
- return mod->symtab[i].st_value;
+ for (i = 0; i < kallsyms->num_symtab; i++)
+ if (strcmp(name, symname(kallsyms, i)) == 0 &&
+ kallsyms->symtab[i].st_info != 'U')
+ return kallsyms->symtab[i].st_value;
return 0;
}
@@ -3841,11 +3873,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
module_assert_mutex();
list_for_each_entry(mod, &modules, list) {
+ /* We hold module_mutex: no need for rcu_dereference_sched */
+ struct mod_kallsyms *kallsyms = mod->kallsyms;
+
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- for (i = 0; i < mod->num_symtab; i++) {
- ret = fn(data, mod->strtab + mod->symtab[i].st_name,
- mod, mod->symtab[i].st_value);
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ ret = fn(data, symname(kallsyms, i),
+ mod, kallsyms->symtab[i].st_value);
if (ret != 0)
return ret;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 4b150bc0c6c1..41e2b54f36b5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -157,8 +157,7 @@ void panic(const char *fmt, ...)
* panic() is not being callled from OOPS.
*/
debug_locks_off();
- console_trylock();
- console_unlock();
+ console_flush_on_panic();
if (!panic_blink)
panic_blink = no_blink;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b7342a24f559..3124cebaec31 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -299,12 +299,12 @@ static int create_image(int platform_mode)
save_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ /* Restore control flow magically appears here */
+ restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
- /* Restore control flow magically appears here */
- restore_processor_state();
if (!in_suspend)
events_check_enabled = false;
@@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode)
pm_message_t msg;
int error;
+ pm_suspend_clear_flags();
error = platform_begin(platform_mode);
if (error)
goto Close;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a970604308f..f155c62f1f2c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -765,9 +765,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
*/
static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur.node = list_entry(bm->cur.node->list.next,
- struct rtree_node, list);
- if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
bm->cur.node_pfn += BM_BITS_PER_BLOCK;
bm->cur.node_bit = 0;
touch_softlockup_watchdog();
@@ -775,9 +775,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/* No more nodes, goto next zone */
- bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur.zone->list != &bm->zones) {
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
char *_braille_console_setup(char **str, char **brl_options)
{
- if (!memcmp(*str, "brl,", 4)) {
+ if (!strncmp(*str, "brl,", 4)) {
*brl_options = "";
*str += 4;
- } else if (!memcmp(str, "brl=", 4)) {
+ } else if (!strncmp(*str, "brl=", 4)) {
*brl_options = *str + 4;
*str = strchr(*brl_options, ',');
if (!*str)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 28fb44dccbad..e7e586bb2022 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2241,13 +2241,24 @@ void console_unlock(void)
static u64 seen_seq;
unsigned long flags;
bool wake_klogd = false;
- bool retry;
+ bool do_cond_resched, retry;
if (console_suspended) {
up_console_sem();
return;
}
+ /*
+ * Console drivers are called under logbuf_lock, so
+ * @console_may_schedule should be cleared before; however, we may
+ * end up dumping a lot of lines, for example, if called from
+ * console registration path, and should invoke cond_resched()
+ * between lines if allowable. Not doing so can cause a very long
+ * scheduling stall on a slow console leading to RCU stall and
+ * softlockup warnings which exacerbate the issue with more
+ * messages practically incapacitating the system.
+ */
+ do_cond_resched = console_may_schedule;
console_may_schedule = 0;
/* flush buffered message fragment immediately to console */
@@ -2319,6 +2330,9 @@ skip:
call_console_drivers(level, ext_text, ext_len, text, len);
start_critical_timings();
local_irq_restore(flags);
+
+ if (do_cond_resched)
+ cond_resched();
}
console_locked = 0;
@@ -2386,6 +2400,25 @@ void console_unblank(void)
console_unlock();
}
+/**
+ * console_flush_on_panic - flush console content on panic
+ *
+ * Immediately output all pending messages no matter what.
+ */
+void console_flush_on_panic(void)
+{
+ /*
+ * If someone else is holding the console lock, trylock will fail
+ * and may_schedule may be set. Ignore and proceed to unlock so
+ * that messages are flushed out. As this can be called from any
+ * context and we don't want to get preempted while flushing,
+ * ensure may_schedule is cleared.
+ */
+ console_trylock();
+ console_may_schedule = 0;
+ console_unlock();
+}
+
/*
* Return the console tty driver structure and its associated index
*/
diff --git a/kernel/resource.c b/kernel/resource.c
index f150dbbe6f62..249b1eb1e6e1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1083,9 +1083,10 @@ struct resource * __request_region(struct resource *parent,
if (!conflict)
break;
if (conflict != parent) {
- parent = conflict;
- if (!(conflict->flags & IORESOURCE_BUSY))
+ if (!(conflict->flags & IORESOURCE_BUSY)) {
+ parent = conflict;
continue;
+ }
}
if (conflict->flags & flags & IORESOURCE_MUXED) {
add_wait_queue(&muxed_resource_wait, &wait);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 471094bfbcc6..1df6da0094f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -640,7 +640,10 @@ int get_nohz_timer_target(void)
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
cpu = i;
goto unlock;
}
@@ -1963,6 +1966,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * [S] p->on_rq = 1; [L] P->state
+ * UNLOCK rq->lock -----.
+ * \
+ * +--- RMB
+ * schedule() /
+ * LOCK rq->lock -----'
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+ *
+ * Pairs with the UNLOCK+LOCK on rq->lock from the
+ * last wakeup of our task and the schedule that got our task
+ * current.
+ */
+ smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
@@ -5117,14 +5142,16 @@ void show_state_filter(unsigned long state_filter)
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
*/
touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
}
- touch_all_softlockup_watchdogs();
-
#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
@@ -5759,6 +5786,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
walt_set_window_start(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
+ account_reset_rq(rq);
break;
case CPU_ONLINE:
@@ -7053,7 +7081,7 @@ static void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;
- for (k = 0; k < nr_node_ids; k++) {
+ for_each_node(k) {
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
@@ -8028,7 +8056,7 @@ void set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
@@ -8054,7 +8082,7 @@ struct task_group *sched_create_group(struct task_group *parent)
return tg;
err:
- free_sched_group(tg);
+ sched_free_group(tg);
return ERR_PTR(-ENOMEM);
}
@@ -8074,17 +8102,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
{
/* now it should be safe to free those cfs_rqs */
- free_sched_group(container_of(rhp, struct task_group, rcu));
+ sched_free_group(container_of(rhp, struct task_group, rcu));
}
-/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
/* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
+ call_rcu(&tg->rcu, sched_free_group_rcu);
}
void sched_offline_group(struct task_group *tg)
@@ -8545,31 +8572,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
+ sched_online_group(tg, parent);
+
return &tg->css;
}
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- struct task_group *parent = css_tg(css->parent);
- if (parent)
- sched_online_group(tg, parent);
- return 0;
+ sched_offline_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
- struct task_group *tg = css_tg(css);
-
- sched_offline_group(tg);
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
}
static void cpu_cgroup_fork(struct task_struct *task, void *private)
@@ -8929,9 +8951,8 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
- .css_online = cpu_cgroup_css_online,
- .css_offline = cpu_cgroup_css_offline,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 442a9f7a2832..acde1d7c763c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -275,21 +275,21 @@ static __always_inline bool steal_account_process_tick(void)
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
u64 steal;
- cputime_t steal_ct;
+ unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
/*
- * cputime_t may be less precise than nsecs (eg: if it's
- * based on jiffies). Lets cast the result to cputime
+ * steal is in nsecs but our caller is expecting steal
+ * time in jiffies. Lets cast the result to jiffies
* granularity and account the rest on the next rounds.
*/
- steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+ steal_jiffies = nsecs_to_jiffies(steal);
+ this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
- account_steal_time(steal_ct);
- return steal_ct;
+ account_steal_time(jiffies_to_cputime(steal_jiffies));
+ return steal_jiffies;
}
#endif
return false;
@@ -616,19 +616,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicy code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -651,7 +657,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b881cf81d79..30d76a18ae1a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -703,8 +703,6 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
@@ -1209,8 +1207,6 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1278,20 +1274,30 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
+ * No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ else {
+ /*
+ * The task_struct must be protected here to protect the
+ * p->numa_faults access in the task_weight since the
+ * numa_faults could already be freed in the following path:
+ * finish_task_switch()
+ * --> put_task_struct()
+ * --> __put_task_struct()
+ * --> task_numa_free()
+ */
+ get_task_struct(cur);
+ }
+
raw_spin_unlock_irq(&dst_rq->lock);
/*
@@ -1375,6 +1381,7 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1400,9 +1407,16 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
+ assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ /*
+ * The dst_rq->curr isn't assigned. The protection for task_struct is
+ * finished.
+ */
+ if (cur && !assigned)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2699,6 +2713,23 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
@@ -2707,15 +2738,15 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- sa->load_avg = max_t(long, sa->load_avg - r, 0);
- sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+ sub_positive(&sa->load_avg, r);
+ sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- sa->util_avg = max_t(long, sa->util_avg - r, 0);
- sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ sub_positive(&sa->util_avg, r);
+ sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2785,10 +2816,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
&se->avg, se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
- cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
- cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
- cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
- cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
}
/* Add the load generated by se into cfs_rq's load average */
@@ -4711,19 +4742,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long w, W;
+ struct cfs_rq *cfs_rq = se->my_q;
+ long W, w = cfs_rq_load_avg(cfs_rq);
- tg = se->my_q->tg;
+ tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + calc_tg_weight(tg, se->my_q);
+ W = wg + atomic_long_read(&tg->load_avg);
+
+ /* Ensure \Sum rw_j >= rw_i */
+ W -= cfs_rq->tg_load_avg_contrib;
+ W += w;
/*
* w = rw_i + @wl
*/
- w = cfs_rq_load_avg(se->my_q) + wl;
+ w += wl;
/*
* wl = S * s'_i; see (2)
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index ef7159012cf3..b0b93fd33af9 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq)
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
- load *= exp;
- load += active * (FIXED_1 - exp);
- load += 1UL << (FSHIFT - 1);
- return load >> FSHIFT;
+ unsigned long newload;
+
+ newload = load * exp + active * (FIXED_1 - exp);
+ if (active >= load)
+ newload += FIXED_1-1;
+
+ return newload / FIXED_1;
}
#ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 06f15724a639..2f2b959ad244 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2025,3 +2025,16 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+ rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ rq->prev_steal_time_rq = 0;
+#endif
+}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 580ac2d4024f..15a1795bbba1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void)
put_seccomp_filter(thread);
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
+
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
/*
* Opt the other thread into seccomp if needed.
* As threads are considered to be trust-realm
* equivalent (see ptrace_may_access), it is safe to
* allow one thread to transition the other.
*/
- if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
- /*
- * Don't let an unprivileged task work around
- * the no_new_privs restriction by creating
- * a thread that sets it up, enters seccomp,
- * then dies.
- */
- if (task_no_new_privs(caller))
- task_set_no_new_privs(thread);
-
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
- }
}
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 0c00e563b153..ba3ddb43dd9f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1855,11 +1855,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
}
- if (prctl_map.exe_fd != (u32)-1)
+ if (prctl_map.exe_fd != (u32)-1) {
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
- down_read(&mm->mmap_sem);
- if (error)
- goto out;
+ if (error)
+ return error;
+ }
+
+ down_write(&mm->mmap_sem);
/*
* We don't validate if these members are pointing to
@@ -1896,10 +1898,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.auxv_size)
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
- error = 0;
-out:
- up_read(&mm->mmap_sem);
- return error;
+ up_write(&mm->mmap_sem);
+ return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
@@ -1965,7 +1965,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EINVAL;
- down_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
vma = find_vma(mm, addr);
prctl_map.start_code = mm->start_code;
@@ -2058,7 +2058,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = 0;
out:
- up_read(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
return error;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f3300de62fe4..abb795e8a6f1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1839,6 +1839,20 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "pipe-user-pages-hard",
+ .data = &pipe_user_pages_hard,
+ .maxlen = sizeof(pipe_user_pages_hard),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "pipe-user-pages-soft",
+ .data = &pipe_user_pages_soft,
+ .maxlen = sizeof(pipe_user_pages_soft),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{ }
};
@@ -2141,6 +2155,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
return 0;
}
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2268,8 +2297,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
int proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- NULL,NULL);
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
}
/*
@@ -2882,6 +2930,12 @@ int proc_dointvec(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2927,6 +2981,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
* exception granted :-)
*/
EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 7e7746a42a62..10a1d7dc9313 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
}
mnt = task_active_pid_ns(current)->proc_mnt;
- file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
+ file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0);
result = PTR_ERR(file);
if (IS_ERR(file))
goto out_putname;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1347882d131e..b98810d2f3b4 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
/* cs is a watchdog. */
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_select_watchdog(bool fallback)
+{
+ struct clocksource *cs, *old_wd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ /* save current watchdog */
+ old_wd = watchdog;
+ if (fallback)
+ watchdog = NULL;
+
+ list_for_each_entry(cs, &clocksource_list, list) {
+ /* cs is a clocksource to be watched. */
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
+ continue;
+
+ /* Skip current if we were requested for a fallback. */
+ if (fallback && cs == old_wd)
+ continue;
+
/* Pick the best watchdog. */
- if (!watchdog || cs->rating > watchdog->rating) {
+ if (!watchdog || cs->rating > watchdog->rating)
watchdog = cs;
- /* Reset watchdog cycles */
- clocksource_reset_watchdog();
- }
}
+ /* If we failed to find a fallback restore the old one. */
+ if (!watchdog)
+ watchdog = old_wd;
+
+ /* If we changed the watchdog we need to reset cycles. */
+ if (watchdog != old_wd)
+ clocksource_reset_watchdog();
+
/* Check if the watchdog timer needs to be started. */
clocksource_start_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
+static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_kthread(void) { return 0; }
@@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
clocksource_select();
+ clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
mutex_lock(&clocksource_mutex);
__clocksource_change_rating(cs, rating);
clocksource_select();
+ clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);
@@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating);
*/
static int clocksource_unbind(struct clocksource *cs)
{
- /*
- * I really can't convince myself to support this on hardware
- * designed by lobotomized monkeys.
- */
- if (clocksource_is_watchdog(cs))
- return -EBUSY;
+ if (clocksource_is_watchdog(cs)) {
+ /* Select and try to install a replacement watchdog. */
+ clocksource_select_watchdog(true);
+ if (clocksource_is_watchdog(cs))
+ return -EBUSY;
+ }
if (cs == curr_clocksource) {
/* Select and try to install a replacement clock source */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index c98439fdce81..405536b22c0c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -94,6 +94,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ /* Make sure we catch unsupported clockids */
+ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
+
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
@@ -102,7 +105,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
- return hrtimer_clock_to_base_table[clock_id];
+ int base = hrtimer_clock_to_base_table[clock_id];
+ BUG_ON(base == HRTIMER_MAX_CLOCK_BASES);
+ return base;
}
/*
@@ -897,10 +902,10 @@ static int enqueue_hrtimer(struct hrtimer *timer,
*/
static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
- unsigned long newstate, int reprogram)
+ u8 newstate, int reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- unsigned int state = timer->state;
+ u8 state = timer->state;
timer->state = newstate;
if (!(state & HRTIMER_STATE_ENQUEUED))
@@ -930,7 +935,7 @@ static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state = timer->state;
+ u8 state = timer->state;
int reprogram;
/*
@@ -954,6 +959,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
return 0;
}
+static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
+ const enum hrtimer_mode mode)
+{
+#ifdef CONFIG_TIME_LOW_RES
+ /*
+ * CONFIG_TIME_LOW_RES indicates that the system has no way to return
+ * granular time values. For relative timers we add hrtimer_resolution
+ * (i.e. one jiffie) to prevent short timeouts.
+ */
+ timer->is_rel = mode & HRTIMER_MODE_REL;
+ if (timer->is_rel)
+ tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
+#endif
+ return tim;
+}
+
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
@@ -974,19 +995,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Remove an active timer from the queue: */
remove_hrtimer(timer, base, true);
- if (mode & HRTIMER_MODE_REL) {
+ if (mode & HRTIMER_MODE_REL)
tim = ktime_add_safe(tim, base->get_time());
- /*
- * CONFIG_TIME_LOW_RES is a temporary way for architectures
- * to signal that they simply return xtime in
- * do_gettimeoffset(). In this case we want to round up by
- * resolution when starting a relative timer, to avoid short
- * timeouts. This will go away with the GTOD framework.
- */
-#ifdef CONFIG_TIME_LOW_RES
- tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
-#endif
- }
+
+ tim = hrtimer_update_lowres(timer, tim, mode);
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
@@ -1074,19 +1086,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
* hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
+ * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
*/
-ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
unsigned long flags;
ktime_t rem;
lock_hrtimer_base(timer, &flags);
- rem = hrtimer_expires_remaining(timer);
+ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
+ rem = hrtimer_expires_remaining_adjusted(timer);
+ else
+ rem = hrtimer_expires_remaining(timer);
unlock_hrtimer_base(timer, &flags);
return rem;
}
-EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
+EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
#ifdef CONFIG_NO_HZ_COMMON
/**
@@ -1220,6 +1236,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
fn = timer->function;
/*
+ * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
+ * timer is restarted with a period then it becomes an absolute
+ * timer. If its not restarted it does not matter.
+ */
+ if (IS_ENABLED(CONFIG_TIME_LOW_RES))
+ timer->is_rel = false;
+
+ /*
* Because we run timers from hardirq context, there is no chance
* they get migrated to another cpu, therefore its safe to unlock
* the timer base.
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 8d262b467573..1d5c7204ddc9 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -26,7 +26,7 @@
*/
static struct timeval itimer_get_remtime(struct hrtimer *timer)
{
- ktime_t rem = hrtimer_get_remaining(timer);
+ ktime_t rem = __hrtimer_get_remaining(timer, true);
/*
* Racy but safe: if the itimer expires after the above
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 149cc8086aea..ab861771e37f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -674,8 +674,24 @@ int ntp_validate_timex(struct timex *txc)
return -EINVAL;
}
- if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
- return -EPERM;
+ if (txc->modes & ADJ_SETOFFSET) {
+ /* In order to inject time, you gotta be super-user! */
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (txc->modes & ADJ_NANO) {
+ struct timespec ts;
+
+ ts.tv_sec = txc->time.tv_sec;
+ ts.tv_nsec = txc->time.tv_usec;
+ if (!timespec_inject_offset_valid(&ts))
+ return -EINVAL;
+
+ } else {
+ if (!timeval_inject_offset_valid(&txc->time))
+ return -EINVAL;
+ }
+ }
/*
* Check for potential multiplication overflows that can
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ce033c7aa2e8..9cff0ab82b63 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -69,10 +69,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
{
struct posix_clock *clk = get_posix_clock(fp);
- int result = 0;
+ unsigned int result = 0;
if (!clk)
- return -ENODEV;
+ return POLLERR;
if (clk->ops.poll)
result = clk->ops.poll(clk, fp, wait);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index f5e86d282d52..80016b329d94 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -808,6 +808,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
timer->it.cpu.expires = 0;
sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
&itp->it_value);
+ return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
unlock_task_sighand(p, &flags);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31d11ac9fa47..f2826c35e918 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
- remaining = ktime_sub(hrtimer_get_expires(timer), now);
+ remaining = __hrtimer_expires_remaining_adjusted(timer, now);
/* Return 0 only, when the timer is expired and not pending */
if (remaining.tv64 <= 0) {
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7c7ec4515983..22c57e191a23 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -977,9 +977,9 @@ static void tick_nohz_switch_to_nohz(void)
/* Get the next period */
next = tick_init_jiffy_update();
- hrtimer_forward_now(&ts->sched_timer, tick_period);
hrtimer_set_expires(&ts->sched_timer, next);
- tick_program_event(next, 1);
+ hrtimer_forward_now(&ts->sched_timer, tick_period);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d563c1960302..445601c580d6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -298,13 +298,11 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
-static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
+static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
+ cycle_t delta)
{
- cycle_t delta;
s64 nsec;
- delta = timekeeping_get_delta(tkr);
-
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
@@ -312,6 +310,24 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
return nsec + arch_gettimeoffset();
}
+static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
+{
+ cycle_t delta;
+
+ delta = timekeeping_get_delta(tkr);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
+
+static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
+ cycle_t cycles)
+{
+ cycle_t delta;
+
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
+
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
@@ -384,7 +400,13 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ now = ktime_to_ns(tkr->base);
+
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tkr->read(tkr->clock),
+ tkr->cycle_last,
+ tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
return now;
@@ -959,7 +981,7 @@ int timekeeping_inject_offset(struct timespec *ts)
struct timespec64 ts64, tmp;
int ret = 0;
- if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ if (!timespec_inject_offset_valid(ts))
return -EINVAL;
ts64 = timespec_to_timespec64(*ts);
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
#include "timekeeping_internal.h"
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
{
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
void tk_debug_account_sleep_time(struct timespec64 *t)
{
- sleep_time_bin[fls(t->tv_sec)]++;
+ /* Cap bin index so we don't overflow the array */
+ int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+ sleep_time_bin[bin]++;
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index f75e35b60149..ba7d8b288bb3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
print_name_offset(m, taddr);
SEQ_printf(m, ", ");
print_name_offset(m, timer->function);
- SEQ_printf(m, ", S:%02lx", timer->state);
+ SEQ_printf(m, ", S:%02x", timer->state);
#ifdef CONFIG_TIMER_STATS
SEQ_printf(m, ", ");
print_name_offset(m, timer->start_site);
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ba04bf0c2653..a9bba37fab5a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,4 +1,8 @@
+# We are fully aware of the dangers of __builtin_return_address()
+FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
+KBUILD_CFLAGS += $(FRAME_CFLAGS)
+
# Do not instrument the tracer itself:
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c6045a27ba3..acbb0e73d3a2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -437,7 +437,7 @@ struct ring_buffer_per_cpu {
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
- unsigned int nr_pages;
+ unsigned long nr_pages;
unsigned int current_context;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
@@ -458,7 +458,7 @@ struct ring_buffer_per_cpu {
u64 write_stamp;
u64 read_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
- int nr_pages_to_update;
+ long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
struct work_struct update_pages_work;
struct completion update_done;
@@ -1137,10 +1137,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}
-static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
{
- int i;
struct buffer_page *bpage, *tmp;
+ long i;
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1177,7 +1177,7 @@ free_pages:
}
static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned nr_pages)
+ unsigned long nr_pages)
{
LIST_HEAD(pages);
@@ -1202,7 +1202,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
}
static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1302,8 +1302,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
struct ring_buffer *buffer;
+ long nr_pages;
int bsize;
- int cpu, nr_pages;
+ int cpu;
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1429,12 +1430,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
}
static int
-rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
{
struct list_head *tail_page, *to_remove, *next_page;
struct buffer_page *to_remove_page, *tmp_iter_page;
struct buffer_page *last_page, *first_page;
- unsigned int nr_removed;
+ unsigned long nr_removed;
unsigned long head_bit;
int page_entries;
@@ -1651,7 +1652,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned nr_pages;
+ unsigned long nr_pages;
int cpu, err = 0;
/*
@@ -1665,14 +1666,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return size;
- size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
- size *= BUF_PAGE_SIZE;
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
/* we need a minimum of two pages */
- if (size < BUF_PAGE_SIZE * 2)
- size = BUF_PAGE_SIZE * 2;
+ if (nr_pages < 2)
+ nr_pages = 2;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size = nr_pages * BUF_PAGE_SIZE;
/*
* Don't succeed if resizing is disabled, as a reader might be
@@ -4645,8 +4645,9 @@ static int rb_cpu_notify(struct notifier_block *self,
struct ring_buffer *buffer =
container_of(self, struct ring_buffer, cpu_notify);
long cpu = (long)hcpu;
- int cpu_i, nr_pages_same;
- unsigned int nr_pages;
+ long nr_pages_same;
+ int cpu_i;
+ unsigned long nr_pages;
switch (action) {
case CPU_UP_PREPARE:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5f375d4c05fb..bff7d4c69274 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1771,7 +1771,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 6, pc, regs);
+ ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
@@ -4816,19 +4816,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
struct trace_iterator *iter = filp->private_data;
ssize_t sret;
- /* return any leftover data */
- sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (sret != -EBUSY)
- return sret;
-
- trace_seq_init(&iter->seq);
-
/*
* Avoid more than one consumer on a single file descriptor
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
mutex_lock(&iter->mutex);
+
+ /* return any leftover data */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (sret != -EBUSY)
+ goto out;
+
+ trace_seq_init(&iter->seq);
+
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
@@ -5038,7 +5039,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
spd.nr_pages = i;
- ret = splice_to_pipe(pipe, &spd);
+ if (i)
+ ret = splice_to_pipe(pipe, &spd);
+ else
+ ret = 0;
out:
splice_shrink_spd(&spd);
return ret;
@@ -5852,9 +5856,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
if (*ppos & (PAGE_SIZE - 1))
return -EINVAL;
@@ -5864,6 +5865,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
again:
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
@@ -5921,19 +5925,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
if (ret)
- return ret;
+ goto out;
+ ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
- return -EAGAIN;
+ goto out;
ret = wait_on_pipe(iter, true);
if (ret)
- return ret;
+ goto out;
goto again;
}
ret = splice_to_pipe(pipe, &spd);
+out:
splice_shrink_spd(&spd);
return ret;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4f6ef6912e00..996f0fd34312 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name)
struct ftrace_event_field *field;
struct list_head *head;
- field = __find_event_field(&ftrace_generic_fields, name);
+ head = trace_get_fields(call);
+ field = __find_event_field(head, name);
if (field)
return field;
- field = __find_event_field(&ftrace_common_fields, name);
+ field = __find_event_field(&ftrace_generic_fields, name);
if (field)
return field;
- head = trace_get_fields(call);
- return __find_event_field(head, name);
+ return __find_event_field(&ftrace_common_fields, name);
}
static int __trace_define_field(struct list_head *head, const char *type,
@@ -171,8 +171,10 @@ static int trace_define_generic_fields(void)
{
int ret;
- __generic_field(int, cpu, FILTER_OTHER);
- __generic_field(char *, comm, FILTER_PTR_STRING);
+ __generic_field(int, CPU, FILTER_CPU);
+ __generic_field(int, cpu, FILTER_CPU);
+ __generic_field(char *, COMM, FILTER_COMM);
+ __generic_field(char *, comm, FILTER_COMM);
return ret;
}
@@ -869,7 +871,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
- if (call->class && call->class->reg)
+ if (call->class && call->class->reg &&
+ !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
return file;
}
@@ -2104,8 +2107,13 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("filter", 0644, file->dir, file,
&ftrace_event_filter_fops);
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+ /*
+ * Only event directories that can be enabled should have
+ * triggers.
+ */
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+ trace_create_file("trigger", 0644, file->dir, file,
+ &event_trigger_fops);
trace_create_file("format", 0444, file->dir, call,
&ftrace_event_format_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f93a219b18da..6816302542b2 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps,
return -EINVAL;
}
- if (is_string_field(field)) {
+ if (field->filter_type == FILTER_COMM) {
+ filter_build_regex(pred);
+ fn = filter_pred_comm;
+ pred->regex.field_len = TASK_COMM_LEN;
+ } else if (is_string_field(field)) {
filter_build_regex(pred);
- if (!strcmp(field->name, "comm")) {
- fn = filter_pred_comm;
- pred->regex.field_len = TASK_COMM_LEN;
- } else if (field->filter_type == FILTER_STATIC_STRING) {
+ if (field->filter_type == FILTER_STATIC_STRING) {
fn = filter_pred_string;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING)
@@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps,
}
pred->val = val;
- if (!strcmp(field->name, "cpu"))
+ if (field->filter_type == FILTER_CPU)
fn = filter_pred_cpu;
else
fn = select_comparison_fn(pred->op, field->size,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index e4e56589ec1d..be3222b7d72e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -109,8 +109,12 @@ static int func_prolog_dec(struct trace_array *tr,
return 0;
local_save_flags(*flags);
- /* slight chance to get a false positive on tracing_cpu */
- if (!irqs_disabled_flags(*flags))
+ /*
+ * Slight chance to get a false positive on tracing_cpu,
+ * although I'm starting to think there isn't a chance.
+ * Leave this for now just to be paranoid.
+ */
+ if (!irqs_disabled_flags(*flags) && !preempt_count())
return 0;
*data = per_cpu_ptr(tr->trace_buffer.data, cpu);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 060df67dbdd1..ad1d6164e946 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt {
static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
{
struct trace_bprintk_fmt *pos;
+
+ if (!fmt)
+ return ERR_PTR(-EINVAL);
+
list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
if (!strcmp(pos->fmt, fmt))
return pos;
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
for (iter = start; iter < end; iter++) {
struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
if (tb_fmt) {
- *iter = tb_fmt->fmt;
+ if (!IS_ERR(tb_fmt))
+ *iter = tb_fmt->fmt;
continue;
}
@@ -296,6 +301,9 @@ static int t_show(struct seq_file *m, void *v)
const char *str = *fmt;
int i;
+ if (!*fmt)
+ return 0;
+
seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
/*
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index dda9e6742950..202df6cffcca 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -126,6 +126,13 @@ check_stack(unsigned long ip, unsigned long *stack)
}
/*
+ * Some archs may not have the passed in ip in the dump.
+ * If that happens, we need to show everything.
+ */
+ if (i == stack_trace_max.nr_entries)
+ i = 0;
+
+ /*
* Now find where in the stack these are.
*/
x = 0;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e864906af3fc..1f1b05f5a94b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -1020,6 +1020,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* both lockup detectors are disabled if proc_watchdog_update()
* returns an error.
*/
+ if (old == new)
+ goto out;
+
err = proc_watchdog_update();
}
out:
@@ -1064,7 +1067,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
int proc_watchdog_thresh(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old;
+ int err, old, new;
get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
@@ -1084,6 +1087,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
/*
* Update the sample period. Restore on failure.
*/
+ new = ACCESS_ONCE(watchdog_thresh);
+ if (old == new)
+ goto out;
+
set_sample_period();
err = proc_watchdog_update();
if (err) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c579dbab2e36..2c2f971f3e75 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -568,6 +568,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
int node)
{
assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+ /*
+ * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+ * delayed item is pending. The plan is to keep CPU -> NODE
+ * mapping valid and stable across CPU on/offlines. Once that
+ * happens, this workaround can be removed.
+ */
+ if (unlikely(node == NUMA_NO_NODE))
+ return wq->dfl_pwq;
+
return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}
@@ -639,6 +649,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
*/
smp_wmb();
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+ /*
+ * The following mb guarantees that previous clear of a PENDING bit
+ * will not be reordered with any speculative LOADS or STORES from
+ * work->current_func, which is executed afterwards. This possible
+ * reordering can lead to a missed execution on attempt to qeueue
+ * the same @work. E.g. consider this case:
+ *
+ * CPU#0 CPU#1
+ * ---------------------------- --------------------------------
+ *
+ * 1 STORE event_indicated
+ * 2 queue_work_on() {
+ * 3 test_and_set_bit(PENDING)
+ * 4 } set_..._and_clear_pending() {
+ * 5 set_work_data() # clear bit
+ * 6 smp_mb()
+ * 7 work->current_func() {
+ * 8 LOAD event_indicated
+ * }
+ *
+ * Without an explicit full barrier speculative LOAD on line 8 can
+ * be executed before CPU#0 does STORE on line 1. If that happens,
+ * CPU#0 observes the PENDING bit is still set and new execution of
+ * a @work is not queued in a hope, that CPU#1 will eventually
+ * finish the queued @work. Meanwhile CPU#1 does not see
+ * event_indicated is set, because speculative LOAD was executed
+ * before actual STORE.
+ */
+ smp_mb();
}
static void clear_work_data(struct work_struct *work)
@@ -1458,13 +1497,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
timer_stats_timer_set_start_info(&dwork->timer);
dwork->wq = wq;
- /* timer isn't guaranteed to run in this cpu, record earlier */
- if (cpu == WORK_CPU_UNBOUND)
- cpu = raw_smp_processor_id();
dwork->cpu = cpu;
timer->expires = jiffies + delay;
- add_timer_on(timer, cpu);
+ if (unlikely(cpu != WORK_CPU_UNBOUND))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
}
/**
@@ -4418,6 +4457,17 @@ static void rebind_workers(struct worker_pool *pool)
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
+
+ /*
+ * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
+ * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
+ * being reworked and this can go away in time.
+ */
+ if (!(pool->flags & POOL_DISASSOCIATED)) {
+ spin_unlock_irq(&pool->lock);
+ return;
+ }
+
pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {