From 0d57cbc66b9018f548f6f0dd3b1aff8bf1a214d2 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 1 Sep 2016 16:14:44 -0700 Subject: kexec: fix double-free when failing to relocate the purgatory commit 070c43eea5043e950daa423707ae3c77e2f48edb upstream. If kexec_apply_relocations fails, kexec_load_purgatory frees pi->sechdrs and pi->purgatory_buf. This is redundant, because in case of error kimage_file_prepare_segments calls kimage_file_post_load_cleanup, which will also free those buffers. This causes two warnings like the following, one for pi->sechdrs and the other for pi->purgatory_buf: kexec-bzImage64: Loading purgatory failed ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2119 at mm/vmalloc.c:1490 __vunmap+0xc1/0xd0 Trying to vfree() nonexistent vm area (ffffc90000e91000) Modules linked in: CPU: 1 PID: 2119 Comm: kexec Not tainted 4.8.0-rc3+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x4d/0x65 __warn+0xcb/0xf0 warn_slowpath_fmt+0x4f/0x60 ? find_vmap_area+0x19/0x70 ? kimage_file_post_load_cleanup+0x47/0xb0 __vunmap+0xc1/0xd0 vfree+0x2e/0x70 kimage_file_post_load_cleanup+0x5e/0xb0 SyS_kexec_file_load+0x448/0x680 ? putname+0x54/0x60 ? do_sys_open+0x190/0x1f0 entry_SYSCALL_64_fastpath+0x13/0x8f ---[ end trace 158bb74f5950ca2b ]--- Fix by setting pi->sechdrs and pi->purgatory_buf to NULL, since vfree won't try to free a NULL pointer. Link: http://lkml.kernel.org/r/1472083546-23683-1-git-send-email-bauerman@linux.vnet.ibm.com Signed-off-by: Thiago Jung Bauermann Acked-by: Baoquan He Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/kexec_file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada0028d2..6030efd4a188 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -934,7 +934,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } -- cgit v1.2.3 From f750847daa22bee9cbb3309f11e8c2eef7bbe5c6 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:38 +0200 Subject: mm: introduce get_task_exe_file commit cd81a9170e69e018bbaba547c1fd85a585f5697a upstream. For more convenient access if one has a pointer to the task. As a minor nit take advantage of the fact that only task lock + rcu are needed to safely grab ->exe_file. This saves mm refcount dance. Use the helper in proc_exe_link. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 23 +++++++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c485cb156772..8860d1f50d24 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -763,6 +763,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) } EXPORT_SYMBOL(get_mm_exe_file); +/** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput().
+ */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + /** * get_task_mm - acquire a reference to the task's mm * -- cgit v1.2.3 From d0259cc85b4c37930c102c21c03f64b5afa25792 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:39 +0200 Subject: audit: fix exe_file access in audit_exe_compare commit 5efc244346f9f338765da3d592f7947b0afdc4b5 upstream. Prior to the change the function would blindly dereference mm, exe_file and exe_file->f_inode, each of which could have been NULL or freed. Use get_task_exe_file to safely obtain stable exe_file. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- kernel/audit_watch.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..939945a5649c 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } -- cgit v1.2.3 From 06ec7a1d7646833cac76516fe78a23577cdb4a8a Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Tue, 9 Aug 2016 11:25:01 +0800 Subject: cpuset: make sure new tasks conform to the current config of the cpuset commit 06f4e94898918bcad00cdd4d349313a439d6911e upstream. A new task inherits cpus_allowed and mems_allowed masks from its parent, but if someone changes cpuset's config by writing to cpuset.cpus/cpuset.mems before this new task is inserted into the cgroup's task list, the new task won't be updated accordingly. Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 11eaf14b52c2..e120bd983ad0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2074,6 +2074,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list.
+ */ +void cpuset_fork(struct task_struct *task) +{ + if (task_css_is_root(task, cpuset_cgrp_id)) + return; + + set_cpus_allowed_ptr(task, &current->cpus_allowed); + task->mems_allowed = current->mems_allowed; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, @@ -2084,6 +2098,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = 1, }; -- cgit v1.2.3 From a37a538e82b037b51ad8418c5c33d1274cfe27d4 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 5 Sep 2016 13:16:40 +1000 Subject: sched/core: Fix a race between try_to_wake_up() and a woken up task commit 135e8c9250dd5c8c9aae5984fde6f230d0cbfeaf upstream. The origin of the issue I've seen is related to a missing memory barrier between the check for task->state and the check for task->on_rq. The task being woken up is already awake from a schedule() and is doing the following: do { schedule() set_current_state(TASK_(UN)INTERRUPTIBLE); } while (!cond); The waker actually gets stuck doing the following in try_to_wake_up(): while (p->on_cpu) cpu_relax(); Analysis: The instance I've seen involves the following race:

 CPU1					CPU2
 while () {
   if (cond)
     break;
   do {
     schedule();
     set_current_state(TASK_UN..)
   } while (!cond);
					wakeup_routine()
					  spin_lock_irqsave(wait_lock)
   raw_spin_lock_irqsave(wait_lock)	  wake_up_process()
 }					try_to_wake_up()
 set_current_state(TASK_RUNNING);	  ..
 list_del(&waiter.list);

CPU2 wakes up CPU1, but before it can get the wait_lock and set current state to TASK_RUNNING the following occurs:

 CPU3
 wakeup_routine()
 raw_spin_lock_irqsave(wait_lock)
 if (!list_empty)
   wake_up_process()
   try_to_wake_up()
   raw_spin_lock_irqsave(p->pi_lock)
   ..
   if (p->on_rq && ttwu_wakeup())
   ..
 while (p->on_cpu)
   cpu_relax()
   ..

CPU3 tries to wake up the task on CPU1 again since it finds it on the wait_queue; CPU1 is spinning on wait_lock, but immediately after CPU2, CPU3 got it. CPU3 checks the state of p on CPU1, it is TASK_UNINTERRUPTIBLE and the task is spinning on the wait_lock. Interestingly, since p->on_rq is checked under pi_lock, I've noticed that try_to_wake_up() finds p->on_rq to be 0. This was the most confusing bit of the analysis, but since p->on_rq is changed under the runqueue lock, rq_lock, the p->on_rq check is not reliable without this fix IMHO. The race is visible (based on the analysis) only when ttwu_queue() does a remote wakeup via ttwu_queue_remote, in which case the p->on_rq change is not done under the pi_lock. The result is that after a while the entire system locks up on raw_spin_lock_irqsave(wait_lock) and the holder spins infinitely. Reproduction of the issue: The issue can be reproduced after a long run on my system with 80 threads, with available memory tweaked very low, while running the stress-ng mmapfork memory test. It usually takes a long time to reproduce. I am trying to work on a test case that can reproduce the issue faster, but that's work in progress. I am still testing the changes on my system in a loop and the tests seem OK thus far. Big thanks to Benjamin and Nick for helping debug this as well. Ben helped catch the missing barrier, Nick caught every missing bit in my theory. Signed-off-by: Balbir Singh [ Updated comment to clarify matching barriers. Many architectures do not have a full barrier in switch_to() so that cannot be relied upon.
] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Benjamin Herrenschmidt Cc: Alexey Kardashevskiy Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e02cce7b-d9ca-1ad0-7a61-ea97c7582b37@gmail.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ea863bc22caf..20253dbc8610 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1945,6 +1945,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p);
+	/*
+	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
+	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
+	 * in smp_cond_load_acquire() below.
+	 *
+	 * sched_ttwu_pending()			try_to_wake_up()
+	 *   [S] p->on_rq = 1;			[L] P->state
+	 *       UNLOCK rq->lock  -----.
+	 *                              \
+	 *                               +--- RMB
+	 * schedule()                   /
+	 *       LOCK rq->lock    -----'
+	 *       UNLOCK rq->lock
+	 *
+	 * [task p]
+	 *   [S] p->state = UNINTERRUPTIBLE	[L] p->on_rq
+	 *
+	 * Pairs with the UNLOCK+LOCK on rq->lock from the
+	 * last wakeup of our task and the schedule that got our task
+	 * current.
+	 */
+	smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; -- cgit v1.2.3 From 268a8bf90ee36d159bdb73079dec9c0e83e362ef Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 1 Sep 2016 16:14:47 -0700 Subject: kconfig: tinyconfig: provide whole choice blocks to avoid warnings commit 236dec051078a8691950f56949612b4b74107e48 upstream. Using "make tinyconfig" produces a couple of annoying warnings that show up for build test machines all the time:

 .config:966:warning: override: NOHIGHMEM changes choice state
 .config:965:warning: override: SLOB changes choice state
 .config:963:warning: override: KERNEL_XZ changes choice state
 .config:962:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state
 .config:933:warning: override: SLOB changes choice state
 .config:930:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state
 .config:870:warning: override: SLOB changes choice state
 .config:868:warning: override: KERNEL_XZ changes choice state
 .config:867:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state

I've made a previous attempt at fixing them and we discussed a number of alternatives. I tried changing the Makefile to use "merge_config.sh -n $(fragment-list)" but couldn't get that to work properly. This is yet another approach, based on the observation that we do want to see a warning for conflicting 'choice' options, and that we can simply make them non-conflicting by listing all other options as disabled. This is a trivial patch that we can apply independent of plans for other changes.
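As an editorial illustration of the mechanism (the fragment contents below are assumed for the example; only the SLAB/SLUB/SLOB choice is taken from the patch that follows), a fragment that flips a choice implicitly triggers the override warning when merged on top of a defconfig, while spelling out the whole choice block does not:

 # Implicit flip: merged on top of a defconfig that had SLAB selected,
 # this produces "warning: override: SLOB changes choice state".
 CONFIG_SLOB=y

 # Whole choice block: every other member is explicitly disabled, so
 # the resulting .config is unambiguous and the merge stays silent.
 # CONFIG_SLAB is not set
 # CONFIG_SLUB is not set
 CONFIG_SLOB=y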
Link: http://lkml.kernel.org/r/20160829214952.1334674-2-arnd@arndb.de Link: https://storage.kernelci.org/mainline/v4.7-rc6/x86-tinyconfig/build.log https://patchwork.kernel.org/patch/9212749/ Signed-off-by: Arnd Bergmann Reviewed-by: Josh Triplett Reviewed-by: Masahiro Yamada Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/configs/tiny.config | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y -- cgit v1.2.3 From 2fbc61d977c4ad1d448c67fa56d8c689121e7f19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 15:32:25 +0200 Subject: genirq/msi: Fix broken debug output commit 4364e1a29be16b2783c0bcbc263f61236af64281 upstream. virq is not required to be the same for all msi descs. Use the base irq number from the desc in the debug printk. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/msi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 4b21779d5163..cd6009006510 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -298,6 +298,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->msi_finish(&arg, 0); for_each_msi_entry(desc, dev) { + virq = desc->irq; if (desc->nvec_used == 1) dev_dbg(dev, "irq %d for MSI\n", virq); else -- cgit v1.2.3 From 29bd03596e9511fa2f1a199d3a7603da28b32899 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 24 Sep 2016 23:29:51 +0200 Subject: Fix build warning in kernel/cpuset.c > 2 ../kernel/cpuset.c:2101:11: warning: initialization from incompatible pointer type [-Wincompatible-pointer-types] > 1 ../kernel/cpuset.c:2101:2: warning: initialization from incompatible pointer type > 1 ../kernel/cpuset.c:2101:2: warning: (near initialization for 'cpuset_cgrp_subsys.fork') This got introduced by 06ec7a1d7646 ("cpuset: make sure new tasks conform to the current config of the cpuset"). In the upstream kernel, the function prototype was changed as of b53202e63089 ("cgroup: kill cgrp_ss_priv[CGROUP_CANFORK_COUNT] and friends"). That patch is not suitable for stable kernels, and fortunately the warning seems harmless as the prototypes only differ in the second argument that is unused. Adding that argument gets rid of the warning: Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e120bd983ad0..b9279a2844d8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2079,7 +2079,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. 
*/ -void cpuset_fork(struct task_struct *task) +void cpuset_fork(struct task_struct *task, void *priv) { if (task_css_is_root(task, cpuset_cgrp_id)) return; -- cgit v1.2.3 From a52031beb067ed3318c3ef226af5fa7223a7787d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Jul 2016 22:30:43 -0400 Subject: Makefile: Mute warning for __builtin_return_address(>0) for tracing only commit 377ccbb483738f84400ddf5840c7dd8825716985 upstream. With the latest gcc compilers, they give a warning if __builtin_return_address() parameter is greater than 0. That is because if it is used by a function called by a top level function (or in the case of the kernel, by assembly), it can try to access stack frames outside the stack and crash the system. The tracing system uses __builtin_return_address() of up to 2! But it is well aware of the dangers that it may have, and has even added precautions to protect against it (see the thunk code in arch/x86/entry/thunk*.S) Linus originally added KBUILD_CFLAGS that would suppress the warning for the entire kernel, as simply adding KBUILD_CFLAGS to the tracing directory wouldn't work. The tracing directory plays a bit with the CFLAGS and requires a little more logic. This adds that special logic to only suppress the warning for the tracing directory. If it is used anywhere else outside of tracing, the warning will still be triggered. Link: http://lkml.kernel.org/r/20160728223043.51996267@grimm.local.home Tested-by: Linus Torvalds Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/Makefile | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 9b1044e936a6..05ea5167e6bb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,4 +1,8 @@ +# We are fully aware of the dangers of __builtin_return_address() +FRAME_CFLAGS := $(call cc-disable-warning,frame-address) +KBUILD_CFLAGS += $(FRAME_CFLAGS) + # Do not instrument the tracer itself: ifdef CONFIG_FUNCTION_TRACER -- cgit v1.2.3 From 369796a8843ec9fce05129c6a80bee9e4b90978b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 17 Sep 2016 18:31:46 -0400 Subject: fix memory leaks in tracing_buffers_splice_read() commit 1ae2293dd6d2f5c823cf97e60b70d03631cd622f upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8305cbb2d5a2..3a44903169ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5766,9 +5766,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) return -EINVAL; @@ -5778,6 +5775,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); @@ -5835,19 +5835,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? 
*/ if (!spd.nr_pages) { if (ret) - return ret; + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) - return -EAGAIN; + goto out; ret = wait_on_pipe(iter, true); if (ret) - return ret; + goto out; goto again; } ret = splice_to_pipe(pipe, &spd); +out: splice_shrink_spd(&spd); return ret; -- cgit v1.2.3 From 8b275b4522af2b19072df7aee87fd659e09d29bc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 23 Sep 2016 22:57:13 -0400 Subject: tracing: Move mutex to protect against resetting of seq data commit 1245800c0f96eb6ebb368593e251d66c01e61022 upstream. The iter->seq can be reset outside the protection of the mutex. So can reading of user data. Move the mutex up to the beginning of the function. Fixes: d7350c3f45694 ("tracing/core: make the read callbacks reentrants") Reported-by: Al Viro Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a44903169ec..059233abcfcf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4727,19 +4727,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; ssize_t sret; - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_init(&iter->seq); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ mutex_lock(&iter->mutex); + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + goto out; + + trace_seq_init(&iter->seq); + if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) -- cgit v1.2.3 From 119c348a8ef2765fb19c96f76480d4aaefdcae23 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 11 Aug 2016 14:49:29 -0700 Subject: PM / hibernate: Restore processor state before using per-CPU variables commit 62822e2ec4ad091ba31f823f577ef80db52e3c2c upstream. Restore the processor state before calling any other functions to ensure per-CPU variables can be used with KASLR memory randomization. Tracing functions use per-CPU variables (GS based on x86) and one was called just before restoring the processor state fully. It resulted in a double fault when both the tracing & the exception handler functions tried to use a per-CPU variable. Fixes: bb3632c6101b (PM / sleep: trace events for suspend/resume) Reported-and-tested-by: Borislav Petkov Reported-by: Jiri Kosina Tested-by: Rafael J. Wysocki Tested-by: Jiri Kosina Signed-off-by: Thomas Garnier Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7dd5718836e..3124cebaec31 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -299,12 +299,12 @@ static int create_image(int platform_mode) save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ + restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - /* Restore control flow magically appears here */ - restore_processor_state(); if (!in_suspend) events_check_enabled = false; -- cgit v1.2.3 From 116fcd882e3b6dc3aeb0fd29f7c93b5628337bc3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 16 Aug 2016 10:46:38 +0100 Subject: PM / hibernate: Fix rtree_next_node() to avoid walking off list ends commit 924d8696751c4b9e58263bc82efdafcf875596a6 upstream. rtree_next_node() walks the linked list of leaf nodes to find the next block of pages in the struct memory_bitmap. If it walks off the end of the list of nodes, it walks the list of memory zones to find the next region of memory. If it walks off the end of the list of zones, it returns false. This leaves the struct bm_position's node and zone pointers pointing at their respective struct list_heads in struct mem_zone_bm_rtree. memory_bm_find_bit() uses struct bm_position's node and zone pointers to avoid walking lists and trees if the next bit appears in the same node/zone. It handles these values being stale. Swap rtree_next_node()s 'step then test' to 'test-next then step', this means if we reach the end of memory we return false and leave the node and zone pointers as they were. This fixes a panic on resume using AMD Seattle with 64K pages: [ 6.868732] Freezing user space processes ... (elapsed 0.000 seconds) done. [ 6.875753] Double checking all user space processes after OOM killer disable... (elapsed 0.000 seconds) [ 6.896453] PM: Using 3 thread(s) for decompression. [ 6.896453] PM: Loading and decompressing image data (5339 pages)... 
[ 7.318890] PM: Image loading progress: 0% [ 7.323395] Unable to handle kernel paging request at virtual address 00800040 [ 7.330611] pgd = ffff000008df0000 [ 7.334003] [00800040] *pgd=00000083fffe0003, *pud=00000083fffe0003, *pmd=00000083fffd0003, *pte=0000000000000000 [ 7.344266] Internal error: Oops: 96000005 [#1] PREEMPT SMP [ 7.349825] Modules linked in: [ 7.352871] CPU: 2 PID: 1 Comm: swapper/0 Tainted: G W I 4.8.0-rc1 #4737 [ 7.360512] Hardware name: AMD Overdrive/Supercharger/Default string, BIOS ROD1002C 04/08/2016 [ 7.369109] task: ffff8003c0220000 task.stack: ffff8003c0280000 [ 7.375020] PC is at set_bit+0x18/0x30 [ 7.378758] LR is at memory_bm_set_bit+0x24/0x30 [ 7.383362] pc : [] lr : [] pstate: 60000045 [ 7.390743] sp : ffff8003c0283b00 [ 7.473551] [ 7.475031] Process swapper/0 (pid: 1, stack limit = 0xffff8003c0280020) [ 7.481718] Stack: (0xffff8003c0283b00 to 0xffff8003c0284000) [ 7.800075] Call trace: [ 7.887097] [] set_bit+0x18/0x30 [ 7.891876] [] duplicate_memory_bitmap.constprop.38+0x54/0x70 [ 7.899172] [] snapshot_write_next+0x22c/0x47c [ 7.905166] [] load_image_lzo+0x754/0xa88 [ 7.910725] [] swsusp_read+0x144/0x230 [ 7.916025] [] load_image_and_restore+0x58/0x90 [ 7.922105] [] software_resume+0x2f0/0x338 [ 7.927752] [] do_one_initcall+0x38/0x11c [ 7.933314] [] kernel_init_freeable+0x14c/0x1ec [ 7.939395] [] kernel_init+0x10/0xfc [ 7.944520] [] ret_from_fork+0x10/0x40 [ 7.949820] Code: d2800022 8b400c21 f9800031 9ac32043 (c85f7c22) [ 7.955909] ---[ end trace 0024a5986e6ff323 ]--- [ 7.960529] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b Here struct mem_zone_bm_rtree's start_pfn has been returned instead of struct rtree_node's addr as the node/zone pointers are corrupt after we walked off the end of the lists during mark_unsafe_pages(). This behaviour was exposed by commit 6dbecfd345a6 ("PM / hibernate: Simplify mark_unsafe_pages()"), which caused mark_unsafe_pages() to call duplicate_memory_bitmap(), which uses memory_bm_find_bit() after walking off the end of the memory bitmap. Fixes: 3a20cb177961 (PM / Hibernate: Implement position keeping in radix tree) Signed-off-by: James Morse [ rjw: Subject ] Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/power/snapshot.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..f155c62f1f2c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -765,9 +765,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) */ static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur.node = list_entry(bm->cur.node->list.next, - struct rtree_node, list); - if (&bm->cur.node->list != &bm->cur.zone->leaves) { + if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); bm->cur.node_pfn += BM_BITS_PER_BLOCK; bm->cur.node_bit = 0; touch_softlockup_watchdog(); @@ -775,9 +775,9 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /* No more nodes, goto next zone */ - bm->cur.zone = list_entry(bm->cur.zone->list.next, + if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur.zone->list != &bm->zones) { bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; -- cgit v1.2.3 From 8132ffc977a4d4572b57362bce70e7e4405bc081 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Sun, 11 Sep 2016 21:14:58 -0700 Subject: cpuset: handle race between CPU hotplug and cpuset_hotplug_work commit 28b89b9e6f7b6c8fef7b3af39828722bca20cfee upstream. A discrepancy between cpu_online_mask and cpuset's effective_cpus mask is inevitable during hotplug since cpuset defers updating of effective_cpus mask using a workqueue, during which time nothing prevents the system from more hotplug operations. For that reason guarantee_online_cpus() walks up the cpuset hierarchy until it finds an intersection under the assumption that top cpuset's effective_cpus mask intersects with cpu_online_mask even with such a race occurring. However a sequence of CPU hotplugs can open a time window, during which none of the effective CPUs in the top cpuset intersect with cpu_online_mask. For example when there are 4 possible CPUs 0-3 and only CPU0 is online: ======================== =========================== cpu_online_mask top_cpuset.effective_cpus ======================== =========================== echo 1 > cpu2/online. CPU hotplug notifier woke up hotplug work but not yet scheduled. [0,2] [0] echo 0 > cpu0/online. The workqueue is still runnable. [2] [0] ======================== =========================== Now there is no intersection between cpu_online_mask and top_cpuset.effective_cpus. 
Thus invoking sys_sched_setaffinity() at this moment can cause the following: Unable to handle kernel NULL pointer dereference at virtual address 000000d0 ------------[ cut here ]------------ Kernel BUG at ffffffc0001389b0 [verbose debug info unavailable] Internal error: Oops - BUG: 96000005 [#1] PREEMPT SMP Modules linked in: CPU: 2 PID: 1420 Comm: taskset Tainted: G W 4.4.8+ #98 task: ffffffc06a5c4880 ti: ffffffc06e124000 task.ti: ffffffc06e124000 PC is at guarantee_online_cpus+0x2c/0x58 LR is at cpuset_cpus_allowed+0x4c/0x6c Process taskset (pid: 1420, stack limit = 0xffffffc06e124020) Call trace: [] guarantee_online_cpus+0x2c/0x58 [] cpuset_cpus_allowed+0x4c/0x6c [] sched_setaffinity+0xc0/0x1ac [] SyS_sched_setaffinity+0x98/0xac [] el0_svc_naked+0x24/0x28 The top cpuset's effective_cpus are guaranteed to be identical to cpu_online_mask eventually. Hence fall back to cpu_online_mask when there is no intersection between top cpuset's effective_cpus and cpu_online_mask. Signed-off-by: Joonwoo Park Acked-by: Li Zefan Cc: Tejun Heo Cc: cgroups@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b9279a2844d8..b271353d5202 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -324,8 +324,7 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. The top - * cpuset always has some cpus online. + * until we find one that does have some online cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. @@ -334,8 +333,20 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { cs = parent_cs(cs); + if (unlikely(!cs)) { + /* + * The top cpuset doesn't have any online cpu as a + * consequence of a race between cpuset_hotplug_work + * and cpu hotplug notifier. But we know the top + * cpuset's effective_cpus is on its way to be + * identical to cpu_online_mask. + */ + cpumask_copy(pmask, cpu_online_mask); + return; + } + } cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } -- cgit v1.2.3 From 711a78a79254470731f2d5f83c22d68c1951ea11 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 14 Jun 2016 16:10:41 +0100 Subject: perf/core: Fix pmu::filter_match for SW-led groups commit 2c81a6477081966fe80b8c6daa68459bca896774 upstream. The following commit: 66eb579e66ec ("perf: allow for PMU-specific event filtering") added the pmu::filter_match() callback. This was intended to avoid HW constraints on events from resulting in extremely pessimistic scheduling. However, pmu::filter_match() is only called for the leader of each event group. When the leader is a SW event, we do not filter the groups, and may fail at pmu::add() time, and when this happens we'll give up on scheduling any event groups later in the list until they are rotated ahead of the failing group. This can result in extremely sub-optimal event scheduling behaviour, e.g.
if running the following on a big.LITTLE platform: $ taskset -c 0 ./perf stat \ -e 'a57{context-switches,armv8_cortex_a57/config=0x11/}' \ -e 'a53{context-switches,armv8_cortex_a53/config=0x11/}' \ ls context-switches (0.00%) armv8_cortex_a57/config=0x11/ (0.00%) 24 context-switches (37.36%) 57589154 armv8_cortex_a53/config=0x11/ (37.36%) Here the 'a53' event group was always eligible to be scheduled, but the 'a57' group never eligible to be scheduled, as the task was always affine to a Cortex-A53 CPU. The SW (group leader) event in the 'a57' group was eligible, but the HW event failed at pmu::add() time, resulting in ctx_flexible_sched_in giving up on scheduling further groups with HW events. One way of avoiding this is to check pmu::filter_match() on siblings as well as the group leader. If any of these fail their pmu::filter_match() call, we must skip the entire group before attempting to add any events. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Fixes: 66eb579e66ec ("perf: allow for PMU-specific event filtering") Link: http://lkml.kernel.org/r/1465917041-15339-1-git-send-email-mark.rutland@arm.com [ Small readability edits. ] Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 12ecd4f0329f..bc6371b0e4fb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1539,12 +1539,33 @@ static int __init perf_workqueue_init(void) core_initcall(perf_workqueue_init); -static inline int pmu_filter_match(struct perf_event *event) +static inline int __pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; return pmu->filter_match ? pmu->filter_match(event) : 1; } +/* + * Check whether we should attempt to schedule an event group based on + * PMU-specific filtering. An event group can consist of HW and SW events, + * potentially with a SW leader, so we must check all the filters, to + * determine whether a group is schedulable: + */ +static inline int pmu_filter_match(struct perf_event *event) +{ + struct perf_event *child; + + if (!__pmu_filter_match(event)) + return 0; + + list_for_each_entry(child, &event->sibling_list, group_entry) { + if (!__pmu_filter_match(child)) + return 0; + } + + return 1; +} + static inline int event_filter_match(struct perf_event *event) { -- cgit v1.2.3 From 6b502c1d73b49bd425f363765330da7fd7d46a20 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Thu, 25 Aug 2016 15:17:00 -0700 Subject: printk: fix parsing of "brl=" option commit ae6c33ba6e37eea3012fe2640b22400ef3f2d0f3 upstream. Commit bbeddf52adc1 ("printk: move braille console support into separate braille.[ch] files") moved the parsing of braille-related options into _braille_console_setup(), changing the type of variable str from char* to char**. In this commit, memcmp(str, "brl,", 4) was correctly updated to memcmp(*str, "brl,", 4) but not memcmp(str, "brl=", 4). Update the code to make "brl=" option work again and replace memcmp() with strncmp() to make the compiler able to detect such an issue. 
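The pointer bug is easy to demonstrate outside the kernel. A minimal stand-alone sketch (hypothetical user-space example, not kernel code; the option string is made up):

 #include <stdio.h>
 #include <string.h>

 static void setup(char **str)
 {
 	/* Broken: str is a char **, so this compares the first four bytes
 	 * of the stored pointer value rather than the option text, and
 	 * memcmp()'s void * parameter hides the type mismatch. */
 	if (!memcmp(str, "brl=", 4))
 		puts("memcmp matched (dead code in practice)");

 	/* Fixed: dereference first; strncmp() takes const char *, so the
 	 * compiler would also warn if handed the wrong pointer type. */
 	if (!strncmp(*str, "brl=", 4))
 		puts("strncmp matched");
 }

 int main(void)
 {
 	char *opt = "brl=ansi,ttyS0";
 	setup(&opt);
 	return 0;
 }

Only the second branch fires, which is why a "brl=" option was silently ignored before this fix.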
Fixes: bbeddf52adc1 ("printk: move braille console support into separate braille.[ch] files") Link: http://lkml.kernel.org/r/20160823165700.28952-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/printk/braille.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -9,10 +9,10 @@ char *_braille_console_setup(char **str, char **brl_options) { - if (!memcmp(*str, "brl,", 4)) { + if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; *str += 4; - } else if (!memcmp(str, "brl=", 4)) { + } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); if (!*str) -- cgit v1.2.3 From 70cd763eb1574cac07138be91f474a661e02d694 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 25 Aug 2016 15:16:51 -0700 Subject: sysctl: handle error writing UINT_MAX to u32 fields commit e7d316a02f683864a12389f8808570e37fb90aa3 upstream. We have scripts which write to certain fields on 3.18 kernels but this seems to be failing on 4.4 kernels. An entry which we write to here is xfrm_aevent_rseqth which is u32. echo 4294967295 > /proc/sys/net/core/xfrm_aevent_rseqth Commit 230633d109e3 ("kernel/sysctl.c: detect overflows when converting to int") prevented writing to sysctl entries when integer overflow occurs. However, this does not apply to unsigned integers. Heinrich suggested that we introduce a new option to handle 64 bit limits and set min as 0 and max as UINT_MAX. This might not work as it leads to issues similar to __do_proc_doulongvec_minmax. Alternatively, we would need to change the datatype of the entry to 64 bit. static int __do_proc_doulongvec_minmax(void *data, struct ctl_table { i = (unsigned long *) data; //This cast is causing to read beyond the size of data (u32) vleft = table->maxlen / sizeof(unsigned long); //vleft is 0 because maxlen is sizeof(u32) which is lesser than sizeof(unsigned long) on x86_64. Introduce a new proc handler proc_douintvec. Individual proc entries will need to be updated to use the new handler. [akpm@linux-foundation.org: coding-style fixes] Fixes: 230633d109e3 ("kernel/sysctl.c:detect overflows when converting to int") Link: http://lkml.kernel.org/r/1471479806-5252-1-git-send-email-subashab@codeaurora.org Signed-off-by: Subash Abhinov Kasiviswanathan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. 
Miller" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2781141a89f9..999e025bf68e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2051,6 +2051,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, @@ -2178,8 +2193,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - NULL,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -2792,6 +2826,12 @@ int proc_dointvec(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2837,6 +2877,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, * exception granted :-) */ EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); -- cgit v1.2.3 From 82b7839a4063855b666f5bd2d309871cbbe21eff Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 1 Sep 2016 16:15:13 -0700 Subject: kernel/fork: fix CLONE_CHILD_CLEARTID regression in nscd commit 735f2770a770156100f534646158cb58cb8b2939 upstream. Commit fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit") has caused a subtle regression in nscd which uses CLONE_CHILD_CLEARTID to clear the nscd_certainly_running flag in the shared databases, so that the clients are notified when nscd is restarted. Now, when nscd uses a non-persistent database, clients that have it mapped keep thinking the database is being updated by nscd, when in fact nscd has created a new (anonymous) one (for non-persistent databases it uses an unlinked file as backend). The original proposal for the CLONE_CHILD_CLEARTID change claimed (https://lkml.org/lkml/2006/10/25/233): : The NPTL library uses the CLONE_CHILD_CLEARTID flag on clone() syscalls : on behalf of pthread_create() library calls. 
This feature is used to : request that the kernel clear the thread-id in user space (at an address : provided in the syscall) when the thread disassociates itself from the : address space, which is done in mm_release(). : : Unfortunately, when a multi-threaded process incurs a core dump (such as : from a SIGSEGV), the core-dumping thread sends SIGKILL signals to all of : the other threads, which then proceed to clear their user-space tids : before synchronizing in exit_mm() with the start of core dumping. This : misrepresents the state of process's address space at the time of the : SIGSEGV and makes it more difficult for someone to debug NPTL and glibc : problems (misleading him/her to conclude that the threads had gone away : before the fault). : : The fix below is to simply avoid the CLONE_CHILD_CLEARTID action if a : core dump has been initiated. The resulting patch from Roland (https://lkml.org/lkml/2006/10/26/269) seems to have a larger scope than the original patch asked for. It seems that limitting the scope of the check to core dumping should work for SIGSEGV issue describe above. [Changelog partly based on Andreas' description] Fixes: fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit") Link: http://lkml.kernel.org/r/1471968749-26173-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: William Preston Acked-by: Oleg Nesterov Cc: Roland McGrath Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8860d1f50d24..7161ebe67cbb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -901,14 +901,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) deactivate_mm(tsk, mm); /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. + * Signal userspace if we're not exiting with a core dump + * because we want to leave the value intact for debugging + * purposes. */ if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && + if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has -- cgit v1.2.3 From 09c5f91afe959ddcf795ec0479b5c032192b4aae Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 19 Sep 2016 14:37:34 -0700 Subject: eas/sched/fair: Fixing comments in find_best_target. Change-Id: I83f5b9887e98f9fdb81318cde45408e7ebfc4b13 Signed-off-by: Srinath Sridharan --- kernel/sched/fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf56241ac262..14c06f92184c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5710,17 +5710,19 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if(prefer_idle) { - // Find a target cpu with lowest - // utilization. + if (prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ if (target_util == 0 || target_util < new_util) { target_cpu = i; target_util = new_util; } } else { - // Find a target cpu with highest - // utilization. 
+ /* Find a target cpu with lowest + * utilization. + */ if (target_util == 0 || target_util > new_util) { target_cpu = i; -- cgit v1.2.3 From 27f0430c615865fc8070e4bba4254f950ad31d7e Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 20 Sep 2016 17:00:47 +0100 Subject: sched/walt: Drop arch-specific timer access On at least one platform, the timer providing the wallclock could occasionally be reset or go backwards for some time after wakeup. Accept that this might happen and warn the first time, but otherwise just carry on. Change-Id: Id3164477ba79049561af7f0889cbeebc199ead4e Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 07b7f84b37e2..2ffb1680b380 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "sched.h" #include "walt.h" @@ -188,10 +187,8 @@ update_window_start(struct rq *rq, u64 wallclock) delta = wallclock - rq->window_start; /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ if (delta < 0) { - if (arch_timer_read_counter() == 0) - delta = 0; - else - BUG_ON(1); + delta = 0; + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n"); } if (delta < walt_ravg_window) -- cgit v1.2.3 From ca4b83c5476f68cb0ead77c23775e7ef68a37058 Mon Sep 17 00:00:00 2001 From: Caesar Wang Date: Tue, 23 Aug 2016 11:47:02 +0100 Subject: sched/fair: remove printk while schedule is in progress Calling printk while a schedule is in progress can cause a deadlock and an endless while(1) loop. The blocked state is as below:

cpu0 (holds the console sem): printk->console_unlock->up_sem->spin_lock(&sem->lock)->wake_up_process(cpu1)->try_to_wake_up(cpu1)->while(p->on_cpu).

cpu1 (requests the console sem): console_lock->down_sem->schedule->idle_balance->update_cpu_capacity->printk->console_trylock->spin_lock(&sem->lock).

p->on_cpu will be 1 forever because the task is still running on cpu1, so cpu0 is blocked in while(p->on_cpu); but cpu1 cannot get spin_lock(&sem->lock), so it is blocked too, which means the task will keep running on cpu1 forever. Signed-off-by: Caesar Wang --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14c06f92184c..30d76a18ae1a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7125,7 +7125,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) mcc->cpu = cpu; #ifdef CONFIG_SCHED_DEBUG raw_spin_unlock_irqrestore(&mcc->lock, flags); - pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); goto skip_unlock; #endif } -- cgit v1.2.3 From fc1d6c8c6a6e7353147bcb02d0236db5714804d2 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 20 Sep 2016 18:42:22 -0700 Subject: sched: Add Kconfig option DEFAULT_USE_ENERGY_AWARE to set ENERGY_AWARE feature flag The ENERGY_AWARE sched feature flag cannot be set unless CONFIG_SCHED_DEBUG is enabled. So this patch allows the flag to default to true at build time if the config is set.
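The DEFAULT_USE_ENERGY_AWARE Kconfig entry itself lives outside kernel/ and is therefore filtered out of this listing; a sketch of what such an entry plausibly looks like (help text assumed, only the symbol and feature names come from the patch):

 config DEFAULT_USE_ENERGY_AWARE
 	bool "Default to the ENERGY_AWARE scheduler feature"
 	default n
 	help
 	  Compile the scheduler with the ENERGY_AWARE feature flag set to
 	  true. Without CONFIG_SCHED_DEBUG the flag cannot be toggled at
 	  runtime, so this is the only way to enable energy-aware
 	  scheduling on such kernels.

With such an option defined, the #ifdef added to kernel/sched/features.h below selects which SCHED_FEAT(ENERGY_AWARE, ...) default gets built in.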
Change-Id: I8835a571fdb7a8f8ee6a54af1e11a69f3b5ce8e6 Signed-off-by: John Stultz --- kernel/sched/features.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index b634151ce286..55e461055332 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,4 +73,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) * Energy aware scheduling. Use platform energy model to guide scheduling * decisions optimizing for energy efficiency. */ +#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE +SCHED_FEAT(ENERGY_AWARE, true) +#else SCHED_FEAT(ENERGY_AWARE, false) +#endif -- cgit v1.2.3 From f885566c5eaed6f3e0746669f9e0997a1a3323de Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 30 Aug 2016 17:19:13 -0400 Subject: BACKPORT: audit: consistently record PIDs with task_tgid_nr() Unfortunately we record PIDs in audit records using a variety of methods despite the correct way being the use of task_tgid_nr(). This patch converts all of these callers, except for the case of AUDIT_SET in audit_receive_msg() (see the comment in the code). Reported-by: Jeff Vander Stoep Signed-off-by: Paul Moore Bug: 28952093 (cherry picked from commit fa2bea2f5cca5b8d4a3e5520d2e8c0ede67ac108) Signed-off-by: Jeff Vander Stoep Change-Id: If6645f9de8bc58ed9755f28dc6af5fbf08d72a00 --- kernel/audit.c | 8 +++++++- kernel/auditsc.c | 12 ++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..34f690b9213a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -870,6 +870,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) @@ -1896,7 +1902,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 48f45987dc6c..63f0e495f517 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1987,7 +1987,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", oldloginuid, loginuid, oldsessionid, sessionid, !rc); @@ -2212,7 +2212,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = 
task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2237,7 +2237,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2337,7 +2337,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2369,7 +2369,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } -- cgit v1.2.3 From 9b57d91c03054030f30be75c6e19c65b5f108ef3 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:19 -0800 Subject: time: Add cycles to nanoseconds translation commit 6bd58f09e1d8cc6c50a824c00bf0d617919986a1 upstream. The timekeeping code does not currently provide a way to translate externally provided clocksource cycles to system time. The cycle count is always provided by the result clocksource read() method internal to the timekeeping code. The added function timekeeping_cycles_to_ns() calculated a nanosecond value from a cycle count that can be added to tk_read_base.base value yielding the current system time. This allows clocksource cycle values external to the timekeeping code to provide a cycle count that can be transformed to system time. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. 
Hall Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4ff237dbc006..34c5b6351cd5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** -- cgit v1.2.3 From 78c7b55b362e868e529ab6579134708fcf5539dd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 4 Oct 2016 19:55:48 -0700 Subject: timekeeping: Fix __ktime_get_fast_ns() regression commit 58bfea9532552d422bde7afa207e1a0f08dffa7d upstream. In commit 27727df240c7 ("Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING"), I changed the logic to open-code the timekeeping_get_ns() function, but I forgot to include the unit conversion from cycles to nanoseconds, breaking the function's output, which impacts users like perf. 
This results in bogus perf timestamps like: swapper 0 [000] 253.427536: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.426573: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.426687: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.426800: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.426905: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427022: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427127: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427239: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427346: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427463: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 255.426572: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) Instead of more reasonable expected timestamps like: swapper 0 [000] 39.953768: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.064839: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.175956: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.287103: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.398217: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.509324: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.620437: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.731546: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.842654: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.953772: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 41.064881: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) Add the proper use of timekeeping_delta_to_ns() to convert the cycle delta to nanoseconds as needed. Thanks to Brendan and Alexei for finding this quickly after the v4.8 release. Unfortunately the problematic commit has landed in some -stable trees so they'll need this fix as well. Many apologies for this mistake. I'll be looking to add a perf-clock sanity test to the kselftest timers tests soon. 
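For reference, the missing step reduces to the mult/shift scaling that timekeeping_delta_to_ns() performs. A minimal sketch of the intended computation (illustrative only; the field names follow the tk_read_base structure from the previous patch, and sketch_delta_to_ns is a made-up helper, not kernel code):

/*
 * Sketch: scale a raw cycle delta into nanoseconds. Without this
 * mult/shift step, __ktime_get_fast_ns() was adding raw cycles to a
 * nanosecond base, producing the bogus timestamps shown above.
 */
static inline s64 sketch_delta_to_ns(cycle_t delta, u32 mult, u32 shift, u64 xtime_nsec)
{
	return (s64)((delta * mult + xtime_nsec) >> shift);
}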
Fixes: 27727df240c7 "timekeeping: Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING" Reported-by: Brendan Gregg Reported-by: Alexei Starovoitov Tested-and-reviewed-by: Mathieu Desnoyers Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1475636148-26539-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Cc: Mathieu Desnoyers Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34c5b6351cd5..445601c580d6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -402,8 +402,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += clocksource_delta(tkr->read(tkr->clock), - tkr->cycle_last, tkr->mask); + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tkr->read(tkr->clock), + tkr->cycle_last, + tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); return now; -- cgit v1.2.3 From 6ddbd662d0bac4e4cb998b4c1fa44cd7c679c3cb Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 6 Oct 2016 15:53:38 -0700 Subject: CHROMIUM: remove Android's cgroup generic permissions checks The implementation is utterly broken, resulting in all processes being allowed to move tasks between sets (as long as they have access to the "tasks" attribute), and upstream is heading towards checking only capability anyway, so let's get rid of this code. BUG=b:31790445,chromium:647994 TEST=Boot android container, examine logcat Change-Id: I2f780a5992c34e52a8f2d0b3557fc9d490da2779 Signed-off-by: Dmitry Torokhov Reviewed-on: https://chromium-review.googlesource.com/394967 Reviewed-by: Ricky Zhou Reviewed-by: John Stultz --- kernel/cgroup.c | 59 ++--------------------------------------------------- kernel/sched/core.c | 1 - 2 files changed, 2 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f53e61f95b55..fec27295a1ec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2671,45 +2671,6 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - if (capable(CAP_SYS_NICE)) - return 0; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if (current != task && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - return -EACCES; - } - - return 0; -} - -static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - int i; - int ret; - - for_each_css(css, i, cgrp) { - if (css->ss->allow_attach) { - ret = css->ss->allow_attach(tset); - if (ret) - return ret; - } else { - return -EACCES; - } - } - - return 0; -} - static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) @@ -2724,24 +2685,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - /* - * if the default permission check fails, give each - * cgroup a chance to extend the permission check - */ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), -
.dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct css_set *cset; - cset = task_css_set(task); - list_add(&cset->mg_node, &tset.src_csets); - ret = cgroup_allow_attach(dst_cgrp, &tset); - list_del(&tset.src_csets); - if (ret) - ret = -EACCES; - } + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 01cb249109cc..1df6da0094f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8956,7 +8956,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; -- cgit v1.2.3 From e129860f82d8484528e51619cc0d55434cbd14bd Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 6 Oct 2016 16:14:16 -0700 Subject: CHROMIUM: cgroups: relax permissions on moving tasks between cgroups Android expects system_server to be able to move tasks between different cgroups/cpusets, but does not want to be running as root. Let's relax permission check so that processes can move other tasks if they have CAP_SYS_NICE in the affected task's user namespace. BUG=b:31790445,chromium:647994 TEST=Boot android container, examine logcat Change-Id: Ia919c66ab6ed6a6daf7c4cf67feb38b13b1ad09b Signed-off-by: Dmitry Torokhov Reviewed-on: https://chromium-review.googlesource.com/394927 Reviewed-by: Ricky Zhou --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fec27295a1ec..fcb037068e3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2685,7 +2685,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { -- cgit v1.2.3 From 172724332ad7df4ac9fa1b2f6d4d96d80175bec6 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 18 Oct 2016 12:35:03 -0700 Subject: cgroup: Remove leftover instances of allow_attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: kernel/sched/tune.c:718:2: error: unknown field ‘allow_attach’ specified in initializer kernel/cpuset.c:2087:2: error: unknown field 'allow_attach' specified in initializer Change-Id: Ie524350ffc6158f3182d90095cca502e58b6f197 Fixes: e78f134a78a0 ("CHROMIUM: remove Android's cgroup generic permissions checks") Signed-off-by: Guenter Roeck --- kernel/cpuset.c | 18 ------------------ kernel/sched/tune.c | 7 ------- 2 files changed, 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 85737aada4d2..3f9db31c5d04 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2095,23 +2095,6 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } -static int cpuset_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if ((current != task) && !capable(CAP_SYS_ADMIN) && - cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) - return -EACCES; - } - 
- return 0; -} - /* * Make sure the new task conform to the current state of its parent, * which could have been changed by cpuset just after it inherits the @@ -2132,7 +2115,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, - .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .post_attach = cpuset_post_attach, diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 505d7b35b0e1..68a24a044b0a 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -368,12 +368,6 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) raw_spin_unlock_irqrestore(&bg->lock, irq_flags); } -int schedtune_allow_attach(struct cgroup_taskset *tset) -{ - /* We always allows tasks to be moved between existing CGroups */ - return 0; -} - int schedtune_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -715,7 +709,6 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, - .allow_attach = schedtune_allow_attach, .can_attach = schedtune_can_attach, .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, -- cgit v1.2.3 From 9d37de65aae1e8783547445f17a1d2b72b94e3ce Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Thu, 9 Jun 2016 23:43:28 -0700 Subject: UPSTREAM: cpu/hotplug: Handle unbalanced hotplug enable/disable (cherry picked from commit 01b41159066531cc8d664362ff0cd89dd137bbfa) When cpu_hotplug_enable() is called unbalanced w/o a preceding cpu_hotplug_disable() the code emits a warning, but happily decrements the disabled counter. This causes the next operations to malfunction. Prevent the decrement and just emit a warning. Signed-off-by: Lianwei Wang Cc: peterz@infradead.org Cc: linux-pm@vger.kernel.org Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/1465541008-12476-1-git-send-email-lianwei.wang@gmail.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ced7c751648..c8a1751be224 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -185,10 +185,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -631,7 +638,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; -- cgit v1.2.3 From 273daee0be36235886622396eedb618fc5de0213 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 18 Oct 2016 16:20:23 -0700 Subject: cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions Try to better match what we're pushing upstream, use CAP_SYS_RESOURCE instead of CAP_SYS_NICE, which shouldn't affect Android as Zygote and system_server already use CAP_SYS_RESOURCE.
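Taken together with the two previous patches in this series, the procs-write permission check reduces to the following predicate (a hedged sketch mirroring cgroup_procs_write_permission() as patched below; may_move_task is a made-up name, not kernel code):

/*
 * Sketch: a writer may migrate a task iff it is global root, its euid
 * matches the target's uid or suid, or it holds CAP_SYS_RESOURCE in
 * the target task's user namespace.
 */
static bool may_move_task(const struct cred *cred, const struct cred *tcred)
{
	return uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
	       uid_eq(cred->euid, tcred->uid) ||
	       uid_eq(cred->euid, tcred->suid) ||
	       ns_capable(tcred->user_ns, CAP_SYS_RESOURCE);
}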
Signed-off-by: John Stultz --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fcb037068e3f..e4552a3cbf41 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2686,7 +2686,7 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid) && - !ns_capable(tcred->user_ns, CAP_SYS_NICE)) + !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { -- cgit v1.2.3 From f2c4508a35a1e4aba0f910ba41c7001bb7801cfe Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Mon, 1 Aug 2016 16:27:38 +0200 Subject: genirq/generic_chip: Add irq_unmap callback commit ee26c013cdee0b947e29d6cadfb9ff3341c69ff9 upstream. Without this patch irq_domain_disassociate() cannot properly release the interrupt. In fact, irq_map_generic_chip() checks a bit on 'gc->installed' but said bit is never cleared, only set. Commit 088f40b7b027 ("genirq: Generic chip: Add linear irq domain support") added irq_map_generic_chip() function and also stated "This lacks a removal function for now". This commit provides an implementation of an unmap function that can be called by irq_domain_disassociate(). [ tglx: Made the function static and removed the export as we have neither a prototype nor a modular user. ] Fixes: 088f40b7b027 ("genirq: Generic chip: Add linear irq domain support") Signed-off-by: Sebastian Frias Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/579F5C5A.2070507@laposte.net Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/generic-chip.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..a4775f3451b9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -411,8 +411,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, } EXPORT_SYMBOL_GPL(irq_map_generic_chip); +static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + struct irq_domain_chip_generic *dgc = d->gc; + unsigned int hw_irq = data->hwirq; + struct irq_chip_generic *gc; + int irq_idx; + + gc = irq_get_domain_generic_chip(d, hw_irq); + if (!gc) + return; + + irq_idx = hw_irq % dgc->irqs_per_chip; + + clear_bit(irq_idx, &gc->installed); + irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, + NULL); + +} + struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, + .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_generic_chip_ops); -- cgit v1.2.3 From 4d4fe8f2b27ceb29c16b32c3882ebd19bf21800f Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 27 Apr 2016 17:47:11 +0100 Subject: PM / Hibernate: Call flush_icache_range() on pages restored in-place Some architectures require code written to memory as if it were data to be 'cleaned' from any data caches before the processor can fetch them as new instructions. During resume from hibernate, the snapshot code copies some pages directly, meaning these architectures do not get a chance to perform their cache maintenance. 
Modify the read and decompress code to call flush_icache_range() on all pages that are restored, so that the restored in-place pages are guaranteed to be executable on these architectures. Signed-off-by: James Morse Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Acked-by: Catalin Marinas [will: make clean_pages_on_* static and remove initialisers] Signed-off-by: Will Deacon (cherry picked from commit f6cf0545ec697ddc278b7457b7d0c0d86a2ea88e) Signed-off-by: Alex Shi --- kernel/power/swap.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 12cd989dadf6..160e1006640d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -36,6 +36,14 @@ #define HIBERNATE_SIG "S1SUSPEND" +/* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + /* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page @@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio) if (bio_data_dir(bio) == WRITE) put_page(page); + else if (clean_pages_on_read) + flush_icache_range((unsigned long)page_address(page), + (unsigned long)page_address(page) + PAGE_SIZE); if (bio->bi_error && !hb->error) hb->error = bio->bi_error; @@ -1049,6 +1060,7 @@ static int load_image(struct swap_map_handle *handle, hib_init_batch(&hb); + clean_pages_on_read = true; printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; @@ -1124,6 +1136,10 @@ static int lzo_decompress_threadfn(void *data) d->unc_len = LZO_UNC_SIZE; d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, d->unc, &d->unc_len); + if (clean_pages_on_decompress) + flush_icache_range((unsigned long)d->unc, + (unsigned long)d->unc + d->unc_len); + atomic_set(&d->stop, 1); wake_up(&d->done); } @@ -1189,6 +1205,8 @@ static int load_image_lzo(struct swap_map_handle *handle, } memset(crc, 0, offsetof(struct crc_data, go)); + clean_pages_on_decompress = true; + /* * Start the decompression threads. */ -- cgit v1.2.3 From 3eb846e0d5ebc563d111ea825e55ea1b731b1a8b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 2 Jan 2016 03:09:16 +0100 Subject: PM / sleep: Add support for read-only sysfs attributes Some sysfs attributes in /sys/power/ should really be read-only, so add support for that, convert those attributes to read-only and drop the stub .show() routines from them. Original-by: Sergey Senozhatsky Signed-off-by: Rafael J. Wysocki (cherry picked from commit a1e9ca6967d68209c70e616a224efa89a6b86ca6) Signed-off-by: Alex Shi --- kernel/power/main.c | 17 ++--------------- kernel/power/power.h | 9 +++++++++ 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999900..27946975eff0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? 
sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index caadb566e82b..efe1b3b17c88 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ -- cgit v1.2.3 From 603c78000f8c9e813f9399619c97be105c820585 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 15 Mar 2016 00:21:06 +0100 Subject: cgroup: avoid false positive gcc-6 warning commit cfe02a8a973e7e5f66926b8ae38dfce404b19e29 upstream. When all subsystems are disabled, gcc notices that cgroup_subsys_enabled_key is a zero-length array and that any access to it must be out of bounds: In file included from ../include/linux/cgroup.h:19:0, from ../kernel/cgroup.c:31: ../kernel/cgroup.c: In function 'cgroup_add_cftypes': ../kernel/cgroup.c:261:53: error: array subscript is above array bounds [-Werror=array-bounds] return static_key_enabled(cgroup_subsys_enabled_key[ssid]); ~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ ../include/linux/jump_label.h:271:40: note: in definition of macro 'static_key_enabled' static_key_count((struct static_key *)x) > 0; \ ^ We should never call the function in this particular case, so this is not a bug. In order to silence the warning, this adds an explicit check for the CGROUP_SUBSYS_COUNT==0 case. Signed-off-by: Arnd Bergmann Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a3424f28aaf4..127c63e02d52 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -236,6 +236,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ static bool cgroup_ssid_enabled(int ssid) { + if (CGROUP_SUBSYS_COUNT == 0) + return false; + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } -- cgit v1.2.3 From 469fcbcb84d809ec05567d51f4fb664b894517e0 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 1 Nov 2016 11:49:56 +0100 Subject: PM / sleep: fix device reference leak in test_suspend commit ceb75787bc75d0a7b88519ab8a68067ac690f55a upstream. Make sure to drop the reference taken by class_find_device() after opening the RTC device. Fixes: 77437fd4e61f (pm: boot time suspend selftest) Signed-off-by: Johan Hovold Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/power/suspend_test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void) /* RTCs have initialized by now too ... can we use one? */ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); return 0; -- cgit v1.2.3 From cd5367ae02a488450b714a5d5f47afb014ea6541 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 16 Nov 2015 11:13:34 -0500 Subject: cgroup: replace __DEVEL__sane_behavior with cgroup2 fs type With major controllers - cpu, memory and io - shaping up for the unified hierarchy, cgroup2 is about ready to be, gradually, released into the wild. Replace __DEVEL__sane_behavior flag which was used to select the unified hierarchy with a separate filesystem type "cgroup2" so that unified hierarchy can be mounted as follows. mount -t cgroup2 none $MOUNT_POINT The cgroup2 fs has its own magic number - 0x63677270 ("cgrp"). v2: Assign a different magic number to cgroup2 fs. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner (cherry picked from commit 67e9c74b8a873408c27ac9a8e4c1d1c8d72c93ff) Signed-off-by: Alex Shi --- kernel/cgroup.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 127c63e02d52..b5946676f84e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. 
*/ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -1650,10 +1651,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1720,15 +1717,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -2007,6 +1995,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; @@ -2023,6 +2012,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -2030,15 +2030,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the @@ -2149,9 +2140,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2194,6 +2186,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -5383,6 +5381,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; -- cgit v1.2.3 From d4a5b037e0af89c315c16aeee941799ae0ff9ced Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 11:58:52 +0530 Subject: cpufreq: sched: Fix kernel crash on accessing sysfs file If the cpufreq driver hasn't set the CPUFREQ_HAVE_GOVERNOR_PER_POLICY flag, then the kernel will crash on accessing sysfs files for the sched governor. 
For CPUFreq governors, we can have the governor-specific sysfs files in two places: A. /sys/devices/system/cpu/cpuX/cpufreq/ B. /sys/devices/system/cpu/cpufreq/ Case A is for the governor-per-policy case, where we can control the governor tunables for each policy separately. Case B is for system-wide tunable values. The schedfreq governor only implements case A, not case B. The sysfs files for case B will still be present in /sys/devices/system/cpu/cpufreq/, but accessing them will crash the kernel as the governor doesn't support that. Moreover, the sched governor is pretty new and will be used only on ARM platforms, so there is no need to support case B at all. Hence use policy->kobj instead of get_governor_parent_kobj(), so that we always create the sysfs files in path A. Signed-off-by: Viresh Kumar --- kernel/sched/cpufreq_sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f6f9b9b3a4a8..d751bc2d0d6e 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -289,7 +289,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) pr_debug("%s: throttle threshold = %u [ns]\n", __func__, gd->up_throttle_nsec); - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); if (rc) { pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); goto err; @@ -332,7 +332,7 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + sysfs_remove_group(&policy->kobj, get_sysfs_attr()); policy->governor_data = NULL; -- cgit v1.2.3 From bcddfb47bf5d54c3b312e165374bed0317d5a767 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:22 -0800 Subject: UPSTREAM: timekeeping: Add a fast and NMI safe boot clock This boot clock can be used as a tracing clock and will account for suspend time. To keep it NMI safe since we're accessing from tracing, we're not using a separate timekeeper with updates to monotonic clock and boot offset protected with seqlocks. This has the following minor side effects: (1) Its possible that a timestamp be taken after the boot offset is updated but before the timekeeper is updated. If this happens, the new boot offset is added to the old timekeeping making the clock appear to update slightly earlier: CPU 0 CPU 1 timekeeping_inject_sleeptime64() __timekeeping_inject_sleeptime(tk, delta); timestamp(); timekeeping_update(tk, TK_CLEAR_NTP...); (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be partially updated. Since the tk->offs_boot update is a rare event, this should be a rare occurrence which postprocessing should be able to handle. Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 445601c580d6..ede4bf13d3e9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -424,6 +424,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
+ * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; -- cgit v1.2.3 From 789790d859146cc223b101efb14d51929051ff89 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:23 -0800 Subject: UPSTREAM: trace: Add an option for boot clock as trace clock Unlike monotonic clock, boot clock as a trace clock will account for time spent in suspend useful for tracing suspend/resume. This uses earlier introduced infrastructure for using the fast boot clock. Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Acked-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bff7d4c69274..293af3346c8c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -890,6 +890,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.2.3 From a8575ee86e3ec89dde405193d61a6c0a958308cf Mon Sep 17 00:00:00 2001 From: Ke Wang Date: Fri, 25 Nov 2016 13:38:45 +0800 Subject: sched: tune: Fix lacking spinlock initialization The spinlock used by boost_groups in sched tune must be initialized. 
This commit fixes this lack and the following errors: [ 0.384739] c2 BUG: spinlock bad magic on CPU#2, swapper/2/0 [ 0.390313] c2 lock: 0xffffffc15fe1fc80, .magic:00000000, .owner: /-1, .owner_cpu: 0 [ 0.398739] c2 CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.4.6+ #4 [ 0.404816] c2 Hardware name: Spreadtrum SP9860gBoard (DT) [ 0.410462] c2 Call trace: [ 0.413159] c2 [] dump_backtrace+0x0/0x210 [ 0.418803] c2 [] show_stack+0x20/0x28 [ 0.424100] c2 [] dump_stack+0xa8/0xe0 [ 0.429398] c2 [] spin_dump+0x78/0x9c [ 0.434608] c2 [] spin_bug+0x30/0x3c [ 0.439644] c2 [] do_raw_spin_lock+0xac/0x1b4 [ 0.445639] c2 [] _raw_spin_lock_irqsave+0x58/0x68 [ 0.451977] c2 [] schedtune_enqueue_task+0x84/0x3bc [ 0.458320] c2 [] enqueue_task_fair+0x438/0x208c [ 0.464487] c2 [] activate_task+0x70/0xd0 [ 0.470130] c2 [] ttwu_do_activate.constprop.131+0x4c/0x98 [ 0.477079] c2 [] try_to_wake_up+0x254/0x54c [ 0.482899] c2 [] default_wake_function+0x30/0x3c [ 0.489154] c2 [] autoremove_wake_function+0x3c/0x6c [ 0.495754] c2 [] __wake_up_common+0x64/0xa4 [ 0.501574] c2 [] __wake_up+0x48/0x60 [ 0.506788] c2 [] rcu_gp_kthread_wake+0x50/0x5c [ 0.512866] c2 [] note_gp_changes+0xac/0xd4 [ 0.518597] c2 [] rcu_process_callbacks+0xe8/0x93c [ 0.524940] c2 [] __do_softirq+0x24c/0x5b8 [ 0.530584] c2 [] irq_exit+0xc0/0xec [ 0.535623] c2 [] __handle_domain_irq+0x94/0xf8 [ 0.541789] c2 [] gic_handle_irq+0x64/0xc0 Signed-off-by: Ke Wang --- kernel/sched/tune.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 68a24a044b0a..079b18802f17 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -725,6 +725,7 @@ schedtune_init_cgroups(void) for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); memset(bg, 0, sizeof(struct boost_groups)); + raw_spin_lock_init(&bg->lock); } pr_info("schedtune: configured to support %d boost groups\n", -- cgit v1.2.3 From dfb704f96cd190239e7b86b09810a136cf25506e Mon Sep 17 00:00:00 2001 From: Ding Tianhong Date: Wed, 15 Jun 2016 15:27:36 +0800 Subject: rcu: Fix soft lockup for rcu_nocb_kthread commit bedc1969150d480c462cdac320fa944b694a7162 upstream. Carrying out the following steps results in a softlockup in the RCU callback-offload (rcuo) kthreads: 1. Connect to ixgbevf, and set the speed to 10Gb/s. 2. Use ifconfig to bring the nic up and down repeatedly. [ 317.005148] IPv6: ADDRCONF(NETDEV_CHANGE): eth2: link becomes ready [ 368.106005] BUG: soft lockup - CPU#1 stuck for 22s! 
[rcuos/1:15] [ 368.106005] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 368.106005] task: ffff88057dd8a220 ti: ffff88057dd9c000 task.ti: ffff88057dd9c000 [ 368.106005] RIP: 0010:[] [] fib_table_lookup+0x14/0x390 [ 368.106005] RSP: 0018:ffff88061fc83ce8 EFLAGS: 00000286 [ 368.106005] RAX: 0000000000000001 RBX: 00000000020155c0 RCX: 0000000000000001 [ 368.106005] RDX: ffff88061fc83d50 RSI: ffff88061fc83d70 RDI: ffff880036d11a00 [ 368.106005] RBP: ffff88061fc83d08 R08: 0000000000000001 R09: 0000000000000000 [ 368.106005] R10: ffff880036d11a00 R11: ffffffff819e0900 R12: ffff88061fc83c58 [ 368.106005] R13: ffffffff816154dd R14: ffff88061fc83d08 R15: 00000000020155c0 [ 368.106005] FS: 0000000000000000(0000) GS:ffff88061fc80000(0000) knlGS:0000000000000000 [ 368.106005] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 368.106005] CR2: 00007f8c2aee9c40 CR3: 000000057b222000 CR4: 00000000000407e0 [ 368.106005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 368.106005] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 368.106005] Stack: [ 368.106005] 00000000010000c0 ffff88057b766000 ffff8802e380b000 ffff88057af03e00 [ 368.106005] ffff88061fc83dc0 ffffffff815349a6 ffff88061fc83d40 ffffffff814ee146 [ 368.106005] ffff8802e380af00 00000000e380af00 ffffffff819e0900 020155c0010000c0 [ 368.106005] Call Trace: [ 368.106005] [ 368.106005] [ 368.106005] [] ip_route_input_noref+0x516/0xbd0 [ 368.106005] [] ? skb_release_data+0xd6/0x110 [ 368.106005] [] ? kfree_skb+0x3a/0xa0 [ 368.106005] [] ip_rcv_finish+0x29f/0x350 [ 368.106005] [] ip_rcv+0x234/0x380 [ 368.106005] [] __netif_receive_skb_core+0x676/0x870 [ 368.106005] [] __netif_receive_skb+0x18/0x60 [ 368.106005] [] process_backlog+0xae/0x180 [ 368.106005] [] net_rx_action+0x152/0x240 [ 368.106005] [] __do_softirq+0xef/0x280 [ 368.106005] [] call_softirq+0x1c/0x30 [ 368.106005] [ 368.106005] [ 368.106005] [] do_softirq+0x65/0xa0 [ 368.106005] [] local_bh_enable+0x94/0xa0 [ 368.106005] [] rcu_nocb_kthread+0x232/0x370 [ 368.106005] [] ? wake_up_bit+0x30/0x30 [ 368.106005] [] ? rcu_start_gp+0x40/0x40 [ 368.106005] [] kthread+0xcf/0xe0 [ 368.106005] [] ? kthread_create_on_node+0x140/0x140 [ 368.106005] [] ret_from_fork+0x58/0x90 [ 368.106005] [] ? kthread_create_on_node+0x140/0x140 ==================================cut here============================== It turns out that the rcuos callback-offload kthread is busy processing a very large quantity of RCU callbacks, and it is not relinquishing the CPU while doing so. This commit therefore adds a cond_resched_rcu_qs() within the loop to allow other tasks to run. Signed-off-by: Ding Tianhong [ paulmck: Substituted cond_resched_rcu_qs for cond_resched. ] Signed-off-by: Paul E. McKenney Cc: Dhaval Giani Signed-off-by: Greg Kroah-Hartman --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..32cbe72bf545 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2275,6 +2275,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); + cond_resched_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); -- cgit v1.2.3 From b27d9147f24a501d632916964f77c1221246fae9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:41 +0000 Subject: locking/rtmutex: Prevent dequeue vs. unlock race commit dbb26055defd03d59f678cb5f2c992abe05b064a upstream. David reported a futex/rtmutex state corruption. It's caused by the following problem:
David reported a futex/rtmutex state corruption. It's caused by the following problem: CPU0 CPU1 CPU2 l->owner=T1 rt_mutex_lock(l) lock(l->wait_lock) l->owner = T1 | HAS_WAITERS; enqueue(T2) boost() unlock(l->wait_lock) schedule() rt_mutex_lock(l) lock(l->wait_lock) l->owner = T1 | HAS_WAITERS; enqueue(T3) boost() unlock(l->wait_lock) schedule() signal(->T2) signal(->T3) lock(l->wait_lock) dequeue(T2) deboost() unlock(l->wait_lock) lock(l->wait_lock) dequeue(T3) ===> wait list is now empty deboost() unlock(l->wait_lock) lock(l->wait_lock) fixup_rt_mutex_waiters() if (wait_list_empty(l)) { owner = l->owner & ~HAS_WAITERS; l->owner = owner ==> l->owner = T1 } lock(l->wait_lock) rt_mutex_unlock(l) fixup_rt_mutex_waiters() if (wait_list_empty(l)) { owner = l->owner & ~HAS_WAITERS; cmpxchg(l->owner, T1, NULL) ===> Success (l->owner = NULL) l->owner = owner ==> l->owner = T1 } That means the problem is caused by fixup_rt_mutex_waiters() which does the RMW to clear the waiters bit unconditionally when there are no waiters in the rtmutexes rbtree. This can be fatal: A concurrent unlock can release the rtmutex in the fastpath because the waiters bit is not set. If the cmpxchg() gets in the middle of the RMW operation then the previous owner, which just unlocked the rtmutex is set as the owner again when the write takes place after the successfull cmpxchg(). The solution is rather trivial: verify that the owner member of the rtmutex has the waiters bit set before clearing it. This does not require a cmpxchg() or other atomic operations because the waiters bit can only be set and cleared with the rtmutex wait_lock held. It's also safe against the fast path unlock attempt. The unlock attempt via cmpxchg() will either see the bit set and take the slowpath or see the bit cleared and release it atomically in the fastpath. It's remarkable that the test program provided by David triggers on ARM64 and MIPS64 really quick, but it refuses to reproduce on x86-64, while the problem exists there as well. That refusal might explain that this got not discovered earlier despite the bug existing from day one of the rtmutex implementation more than 10 years ago. Thanks to David for meticulously instrumenting the code and providing the information which allowed to decode this subtle problem. 
Reported-by: David Daney Tested-by: David Daney Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Will Deacon Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core") Link: http://lkml.kernel.org/r/20161130210030.351136722@linutronix.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/locking/rtmutex.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8251e75dd9c0..b066724d7a5b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) static void fixup_rt_mutex_waiters(struct rt_mutex *lock) { - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); + unsigned long owner, *p = (unsigned long *) &lock->owner; + + if (rt_mutex_has_waiters(lock)) + return; + + /* + * The rbtree has no waiters enqueued, now make sure that the + * lock->owner still has the waiters bit set, otherwise the + * following can happen: + * + * CPU 0 CPU 1 CPU2 + * l->owner=T1 + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T2) + * boost() + * unlock(l->lock) + * block() + * + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T3) + * boost() + * unlock(l->lock) + * block() + * signal(->T2) signal(->T3) + * lock(l->lock) + * dequeue(T2) + * deboost() + * unlock(l->lock) + * lock(l->lock) + * dequeue(T3) + * ==> wait list is empty + * deboost() + * unlock(l->lock) + * lock(l->lock) + * fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * l->owner = owner + * owner = l->owner & ~HAS_WAITERS; + * ==> l->owner = T1 + * } + * lock(l->lock) + * rt_mutex_unlock(l) fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * owner = l->owner & ~HAS_WAITERS; + * cmpxchg(l->owner, T1, NULL) + * ===> Success (l->owner = NULL) + * + * l->owner = owner + * ==> l->owner = T1 + * } + * + * With the check for the waiter bit in place T3 on CPU2 will not + * overwrite. All tasks fiddling with the waiters bit are + * serialized by l->lock, so nothing else can modify the waiters + * bit. If the bit is set then nothing can change l->owner either + * so the simple RMW is safe. The cmpxchg() will simply fail if it + * happens in the middle of the RMW because the waiters bit is + * still set. + */ + owner = READ_ONCE(*p); + if (owner & RT_MUTEX_HAS_WAITERS) + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); } /* -- cgit v1.2.3 From c6a5bf4cda12e4a3c097036ce7044ee65e4cf6d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:42 +0000 Subject: locking/rtmutex: Use READ_ONCE() in rt_mutex_owner() commit 1be5d4fa0af34fb7bafa205aeb59f5c7cc7a089d upstream. While debugging the rtmutex unlock vs. dequeue race Will suggested to use READ_ONCE() in rt_mutex_owner() as it might race against the cmpxchg_release() in unlock_rt_mutex_safe(). Will: "It's a minor thing which will most likely not matter in practice" Careful search did not unearth an actual problem in todays code, but it's better to be safe than surprised. 
Suggested-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20161130210030.431379999@linutronix.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/locking/rtmutex_common.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 4f5f83c7d2d3..e317e1cbb3eb 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -75,8 +75,9 @@ task_top_pi_waiter(struct task_struct *p) static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL); } /* -- cgit v1.2.3 From 1c0f4e0ebb791cad8fee77d8de1ceb684e631698 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 7 Dec 2016 14:54:38 +0100 Subject: hotplug: Make register and unregister notifier API symmetric commit 777c6e0daebb3fcefbbd6f620410a946b07ef6d0 upstream. Yu Zhao has noticed that __unregister_cpu_notifier only unregisters its notifiers when HOTPLUG_CPU=y while the registration might succeed even when HOTPLUG_CPU=n if MODULE is enabled. This means that e.g. zswap might keep a stale notifier on the list on the manual clean up during the pool tear down and thus corrupt the list. Resulting in the following [ 144.964346] BUG: unable to handle kernel paging request at ffff880658a2be78 [ 144.971337] IP: [] raw_notifier_chain_register+0x1b/0x40 [ 145.122628] Call Trace: [ 145.125086] [] __register_cpu_notifier+0x18/0x20 [ 145.131350] [] zswap_pool_create+0x273/0x400 [ 145.137268] [] __zswap_param_set+0x1fc/0x300 [ 145.143188] [] ? trace_hardirqs_on+0xd/0x10 [ 145.149018] [] ? kernel_param_lock+0x28/0x30 [ 145.154940] [] ? __might_fault+0x4f/0xa0 [ 145.160511] [] zswap_compressor_param_set+0x17/0x20 [ 145.167035] [] param_attr_store+0x5c/0xb0 [ 145.172694] [] module_attr_store+0x1d/0x30 [ 145.178443] [] sysfs_kf_write+0x4f/0x70 [ 145.183925] [] kernfs_fop_write+0x149/0x180 [ 145.189761] [] __vfs_write+0x18/0x40 [ 145.194982] [] vfs_write+0xb2/0x1a0 [ 145.200122] [] SyS_write+0x52/0xa0 [ 145.205177] [] entry_SYSCALL_64_fastpath+0x12/0x17 This can be even triggered manually by changing /sys/module/zswap/parameters/compressor multiple times. Fix this issue by making unregister APIs symmetric to the register so there are no surprises. 
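The failure mode is easiest to see from a module that pairs the two calls on a kernel where CONFIG_HOTPLUG_CPU=n but registration is still compiled in (e.g. with MODULE support enabled, as the message above notes). A hedged sketch; my_cpu_callback and the module boilerplate are hypothetical:

#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	return NOTIFY_OK;	/* hypothetical no-op handler */
}

static struct notifier_block my_nb = {
	.notifier_call = my_cpu_callback,
};

static int __init my_init(void)
{
	/* Registration succeeds even with CONFIG_HOTPLUG_CPU=n ... */
	return register_cpu_notifier(&my_nb);
}

static void __exit my_exit(void)
{
	/*
	 * ... but before this fix the unregister path was compiled out,
	 * leaving a stale notifier block on the chain after unload.
	 */
	unregister_cpu_notifier(&my_nb);
}

module_init(my_init);
module_exit(my_exit);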
Fixes: 47e627bc8c9a ("[PATCH] hotplug: Allow modules to use the cpu hotplug notifiers even if !CONFIG_HOTPLUG_CPU") Reported-and-tested-by: Yu Zhao Signed-off-by: Michal Hocko Cc: linux-mm@kvack.org Cc: Andrew Morton Cc: Dan Streetman Link: http://lkml.kernel.org/r/20161207135438.4310-1-mhocko@kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/cpu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e26e23b..cd6d1258554e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -223,8 +223,6 @@ static int cpu_notify(unsigned long val, void *v) return __cpu_notify(val, v, -1, NULL); } -#ifdef CONFIG_HOTPLUG_CPU - static void cpu_notify_nofail(unsigned long val, void *v) { BUG_ON(cpu_notify(val, v)); @@ -246,6 +244,7 @@ void __unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(__unregister_cpu_notifier); +#ifdef CONFIG_HOTPLUG_CPU /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id -- cgit v1.2.3 From ede03b5d80a61a01217635de00b805c53d87e474 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 6 Dec 2016 11:50:53 +0000 Subject: sched/walt: kill {min,max}_capacity {min,max}_capacity are static variables that are only updated from __update_min_max_capacity(), but not used anywhere else. Remove them together with the function updating them. This has also the nice side effect of fixing a LOCKDEP warning related to locking all CPUs in update_min_max_capacity(), as reported by Ke Wang: [ 2.853595] c0 ============================================= [ 2.859219] c0 [ INFO: possible recursive locking detected ] [ 2.864852] c0 4.4.6+ #5 Tainted: G W [ 2.869604] c0 --------------------------------------------- [ 2.875230] c0 swapper/0/1 is trying to acquire lock: [ 2.880248] (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c [ 2.888815] c0 [ 2.888815] c0 but task is already holding lock: [ 2.895132] (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c [ 2.903700] c0 [ 2.903700] c0 other info that might help us debug this: [ 2.910710] c0 Possible unsafe locking scenario: [ 2.910710] c0 [ 2.917112] c0 CPU0 [ 2.919795] c0 ---- [ 2.922478] lock(&rq->lock); [ 2.925507] lock(&rq->lock); [ 2.928536] c0 [ 2.928536] c0 *** DEADLOCK *** [ 2.928536] c0 [ 2.935200] c0 May be due to missing lock nesting notation [ 2.935200] c0 [ 2.942471] c0 7 locks held by swapper/0/1: [ 2.946623] #0: (&dev->mutex){......}, at: [] __driver_attach+0x64/0xb8 [ 2.954931] #1: (&dev->mutex){......}, at: [] __driver_attach+0x74/0xb8 [ 2.963239] #2: (cpu_hotplug.lock){++++++}, at: [] get_online_cpus+0x48/0xa8 [ 2.971979] #3: (subsys mutex#6){+.+.+.}, at: [] subsys_interface_register+0x44/0xc0 [ 2.981411] #4: (&policy->rwsem){+.+.+.}, at: [] cpufreq_online+0x330/0x76c [ 2.990065] #5: ((cpufreq_policy_notifier_list).rwsem){.+.+..}, at: [] blocking_notifier_call_chain+0x38/0xc4 [ 3.001661] #6: (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c [ 3.010661] c0 [ 3.010661] c0 stack backtrace: [ 3.015514] c0 CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.4.6+ #5 [ 3.022864] c0 Hardware name: Spreadtrum SP9860g Board (DT) [ 3.028402] c0 Call trace: [ 3.031092] c0 [] dump_backtrace+0x0/0x210 [ 3.036716] c0 [] show_stack+0x20/0x28 [ 3.041994] c0 [] dump_stack+0xa8/0xe0 [ 3.047273] c0 [] __lock_acquire+0x1e0c/0x2218 [ 3.053243] c0 [] lock_acquire+0xe0/0x280 [ 3.058784] c0 [] _raw_spin_lock+0x44/0x58 [ 3.064407] c0 [] cpufreq_notifier_policy+0x2e8/0x37c [ 
3.070983] c0 [] blocking_notifier_call_chain+0x78/0xc4 [ 3.077820] c0 [] cpufreq_online+0x28c/0x76c [ 3.083618] c0 [] cpufreq_add_dev+0x98/0xdc [ 3.089331] c0 [] subsys_interface_register+0x84/0xc0 [ 3.095907] c0 [] cpufreq_register_driver+0x168/0x28c [ 3.102486] c0 [] sprd_cpufreq_probe+0x134/0x19c [ 3.108629] c0 [] platform_drv_probe+0x58/0xd0 [ 3.114599] c0 [] driver_probe_device+0x1e8/0x470 [ 3.120830] c0 [] __driver_attach+0xb4/0xb8 [ 3.126541] c0 [] bus_for_each_dev+0x6c/0xac [ 3.132339] c0 [] driver_attach+0x2c/0x34 [ 3.137877] c0 [] bus_add_driver+0x210/0x298 [ 3.143676] c0 [] driver_register+0x7c/0x114 [ 3.149476] c0 [] __platform_driver_register+0x60/0x6c [ 3.156139] c0 [] sprd_cpufreq_platdrv_init+0x18/0x20 [ 3.162714] c0 [] do_one_initcall+0xd0/0x1d8 [ 3.168514] c0 [] kernel_init_freeable+0x1fc/0x29c [ 3.174834] c0 [] kernel_init+0x20/0x12c [ 3.180281] c0 [] ret_from_fork+0x10/0x40 Reported-by: Ke Wang Signed-off-by: Juri Lelli --- kernel/sched/walt.c | 45 +-------------------------------------------- 1 file changed, 1 insertion(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 2ffb1680b380..6e053bd9830c 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -62,8 +62,6 @@ static unsigned int max_possible_freq = 1; */ static unsigned int min_max_freq = 1; -static unsigned int max_capacity = 1024; -static unsigned int min_capacity = 1024; static unsigned int max_load_scale_factor = 1024; static unsigned int max_possible_capacity = 1024; @@ -869,39 +867,6 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) double_rq_unlock(src_rq, dest_rq); } -/* Keep track of max/min capacity possible across CPUs "currently" */ -static void __update_min_max_capacity(void) -{ - int i; - int max = 0, min = INT_MAX; - - for_each_online_cpu(i) { - if (cpu_rq(i)->capacity > max) - max = cpu_rq(i)->capacity; - if (cpu_rq(i)->capacity < min) - min = cpu_rq(i)->capacity; - } - - max_capacity = max; - min_capacity = min; -} - -static void update_min_max_capacity(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - __update_min_max_capacity(); - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - local_irq_restore(flags); -} - /* * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that * least efficient cpu gets capacity of 1024 @@ -984,15 +949,9 @@ static int cpufreq_notifier_policy(struct notifier_block *nb, /* Initialized to policy->max in case policy->related_cpus is empty! */ unsigned int orig_max_freq = policy->max; - if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && - val != CPUFREQ_CREATE_POLICY) + if (val != CPUFREQ_NOTIFY) return 0; - if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { - update_min_max_capacity(); - return 0; - } - for_each_cpu(i, policy->related_cpus) { cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, policy->related_cpus); @@ -1082,8 +1041,6 @@ static int cpufreq_notifier_policy(struct notifier_block *nb, max_load_scale_factor = highest_mplsf; } - __update_min_max_capacity(); - return 0; } -- cgit v1.2.3 From 03eed7afbc09e061f66b448daf7863174c3dc3f3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 13 Oct 2016 21:23:16 -0500 Subject: mm: Add a user_ns owner to mm_struct and fix ptrace permission checks commit bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 upstream. 
During exec dumpable is cleared if the file that is being executed is not readable by the user executing the file. A bug in ptrace_may_access allows reading the file if the executable happens to enter into a subordinate user namespace (aka clone(CLONE_NEWUSER), unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER)). This problem is fixed with only necessary userspace breakage by adding a user namespace owner to mm_struct, captured at the time of exec, so it is clear in which user namespace CAP_SYS_PTRACE must be present to be able to safely give read permission to the executable. The function ptrace_may_access is modified to verify that the ptracer has CAP_SYS_PTRACE in task->mm->user_ns instead of task->cred->user_ns. This ensures that if the task changes its cred into a subordinate user namespace it does not become ptraceable. The function ptrace_attach is modified to only set PT_PTRACE_CAP when CAP_SYS_PTRACE is held over task->mm->user_ns. The intent of PT_PTRACE_CAP is to be a flag to note that whatever permission changes the task might go through the tracer has sufficient permissions for it not to be an issue. task->cred->user_ns is always the same as or a descendant of mm->user_ns, which guarantees that having CAP_SYS_PTRACE over mm->user_ns is the worst case for the task's credentials. To prevent regressions, mm->dumpable and mm->user_ns are not considered when a task has no mm, as simply failing ptrace_may_attach causes regressions in privileged applications attempting to read things such as /proc//stat Acked-by: Kees Cook Tested-by: Cyrill Gorcunov Fixes: 8409cca70561 ("userns: allow ptrace from non-init user namespaces") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 9 ++++++--- kernel/ptrace.c | 26 +++++++++++--------------- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7161ebe67cbb..2e55b53399de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,7 +585,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + struct user_namespace *user_ns) { mm->mmap = NULL; mm->mm_rb = RB_ROOT; @@ -625,6 +626,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) if (init_new_context(p, mm)) goto fail_nocontext; + mm->user_ns = get_user_ns(user_ns); return mm; fail_nocontext: @@ -670,7 +672,7 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - return mm_init(mm, current); + return mm_init(mm, current, current_user_ns()); } /* @@ -685,6 +687,7 @@ void __mmdrop(struct mm_struct *mm) destroy_context(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); + put_user_ns(mm->user_ns); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -942,7 +945,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm, tsk)) + if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; err = dup_mmap(mm, oldmm); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 3189e51db7e8..21b60e0a3056 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -219,7 +219,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - int dumpable = 0; + struct mm_struct *mm; kuid_t caller_uid; kgid_t
caller_gid; @@ -270,16 +270,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); - smp_rmb(); - if (task->mm) - dumpable = get_dumpable(task->mm); - rcu_read_lock(); - if (dumpable != SUID_DUMP_USER && - !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); + mm = task->mm; + if (mm && + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptrace_has_cap(mm->user_ns, mode))) + return -EPERM; return security_ptrace_access_check(task, mode); } @@ -330,6 +325,11 @@ static int ptrace_attach(struct task_struct *task, long request, task_lock(task); retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + if (!retval) { + struct mm_struct *mm = task->mm; + if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE)) + flags |= PT_PTRACE_CAP; + } task_unlock(task); if (retval) goto unlock_creds; @@ -343,10 +343,6 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - rcu_read_lock(); - if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) - flags |= PT_PTRACE_CAP; - rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); -- cgit v1.2.3 From 1c1f15f8ebfbd5042883a1c9ae4b18a6299c9c5f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Nov 2016 18:48:07 -0600 Subject: ptrace: Capture the ptracer's creds not PT_PTRACE_CAP commit 64b875f7ac8a5d60a4e191479299e931ee949b67 upstream. When the flag PT_PTRACE_CAP was added, the PTRACE_TRACEME path was overlooked. This can result in incorrect behavior when an application like strace traces an exec of a setuid executable. Further, PT_PTRACE_CAP does not have enough information for making good security decisions, as it does not report which user namespace the capability is in. This has already allowed one mistake through insufficient granularity. I found this issue when I was testing another corner case of exec and discovered that I could not get strace to set PT_PTRACE_CAP even when running strace as root with a full set of caps. This change fixes the above issue with strace, allowing root to strace a setuid executable without setuid being disabled. More fundamentally, this change allows what is allowable at all times, by using the correct information in its decision. Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/capability.c | 20 ++++++++++++++++++++ kernel/ptrace.c | 12 +++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 00411c82dac5..dfa0e4528b0b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -473,3 +473,23 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) kgid_has_mapping(ns, inode->i_gid); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); + +/** + * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace + * @tsk: The task that may be ptraced + * @ns: The user namespace to search for CAP_SYS_PTRACE in + * + * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE + * in the specified user namespace.
+ */ +bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) +{ + int ret = 0; /* An absent tracer adds no restrictions */ + const struct cred *cred; + rcu_read_lock(); + cred = rcu_dereference(tsk->ptracer_cred); + if (cred) + ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + rcu_read_unlock(); + return (ret == 0); +} diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 21b60e0a3056..a46c40bfb5f6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -39,6 +39,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) BUG_ON(!list_empty(&child->ptrace_entry)); list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; + rcu_read_lock(); + child->ptracer_cred = get_cred(__task_cred(new_parent)); + rcu_read_unlock(); } /** @@ -71,11 +74,15 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) */ void __ptrace_unlink(struct task_struct *child) { + const struct cred *old_cred; BUG_ON(!child->ptrace); child->ptrace = 0; child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + old_cred = child->ptracer_cred; + child->ptracer_cred = NULL; + put_cred(old_cred); spin_lock(&child->sighand->siglock); @@ -325,11 +332,6 @@ static int ptrace_attach(struct task_struct *task, long request, task_lock(task); retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); - if (!retval) { - struct mm_struct *mm = task->mm; - if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE)) - flags |= PT_PTRACE_CAP; - } task_unlock(task); if (retval) goto unlock_creds; -- cgit v1.2.3 From b35f34f66943fa1b8feadb037d054a141723f7fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 16 Nov 2016 22:06:51 -0600 Subject: exec: Ensure mm->user_ns contains the execed files commit f84df2a6f268de584a201e8911384a2d244876e3 upstream. When the user namespace support was merged, the need to prevent ptrace from revealing the contents of an unreadable executable was overlooked. Correct this oversight by ensuring that the executed file or files are in mm->user_ns, by adjusting mm->user_ns. Use the new function privileged_wrt_inode_uidgid to see if the executable is a member of the user namespace, and as such if having CAP_SYS_PTRACE in the user namespace should allow tracing the executable. If not, update mm->user_ns to the parent user namespace until an appropriate parent is found. Reported-by: Jann Horn Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/capability.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index dfa0e4528b0b..4984e1f552eb 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -456,6 +456,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, } EXPORT_SYMBOL(file_ns_capable); +/** + * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? + * @ns: The user namespace in question + * @inode: The inode in question + * + * Return true if the inode uid and gid are within the namespace.
+ */ +bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) +{ + return kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); +} + /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question @@ -469,8 +482,7 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && - kgid_has_mapping(ns, inode->i_gid); + return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); -- cgit v1.2.3 From f2b8b3455b22405237110d929f107318a535a480 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 14 Dec 2016 15:04:04 -0800 Subject: kernel/watchdog: use nmi registers snapshot in hardlockup handler commit 4d1f0fb096aedea7bb5489af93498a82e467c480 upstream. The NMI handler doesn't call set_irq_regs(); that is set only by a normal IRQ. Thus get_irq_regs() returns NULL or a stale registers snapshot with IP/SP pointing to the code interrupted by the IRQ which was interrupted by the NMI. NULL isn't a problem: in this case the watchdog calls dump_stack() and prints a full stack trace including the NMI. But if we're stuck in an IRQ handler then the NMI watchdog will print a stack trace without the IRQ part at all. This patch uses the registers snapshot passed into the NMI handler as an argument: these registers point exactly to the instruction interrupted by the NMI. Fixes: 55537871ef66 ("kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup") Link: http://lkml.kernel.org/r/146771764784.86724.6006627197118544150.stgit@buzz Signed-off-by: Konstantin Khlebnikov Cc: Jiri Kosina Cc: Ulrich Obergfell Cc: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/watchdog.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 198137b1cadc..c1e0b5f429b6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -328,7 +328,6 @@ static void watchdog_overflow_callback(struct perf_event *event, */ if (is_hardlockup()) { int this_cpu = smp_processor_id(); - struct pt_regs *regs = get_irq_regs(); /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) -- cgit v1.2.3 From f93777c915446c772761b7d8f6eb7ef49eca4e63 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 14 Dec 2016 15:05:49 -0800 Subject: kernel/debug/debug_core.c: more properly delay for secondary CPUs commit 2d13bb6494c807bcf3f78af0e96c0b8615a94385 upstream. We've got a delay loop waiting for secondary CPUs. That loop uses loops_per_jiffy. However, loops_per_jiffy doesn't actually mean how many tight loops make up a jiffy on all architectures. It is quite common to see things like this in the boot log: Calibrating delay loop (skipped), value calculated using timer frequency.. 48.00 BogoMIPS (lpj=24000) In my case I was seeing lots of cases where other CPUs timed out entering the debugger only to print their stack crawls shortly after the kdb> prompt was written. Elsewhere in kgdb we already use udelay(), so that should be safe enough to use to implement our timeout. We'll delay 1 ms for 1000 times, which should give us a full second of delay (just like the old code wanted) but allow us to notice that we're done every 1 ms.
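Concretely, the resulting wait loop looks like the sketch below; it mirrors the diff that follows and is an illustration, not the verbatim kernel code:

	/*
	 * Poll once per millisecond, at most MSEC_PER_SEC times, so the
	 * total wait is about one second by construction instead of
	 * depending on how loops_per_jiffy happened to be calibrated.
	 */
	time_left = MSEC_PER_SEC;
	while (kgdb_do_roundup && --time_left &&
	       (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
			online_cpus)
		udelay(1000);		/* 1 ms busy-wait per poll */
	if (!time_left)
		pr_crit("Timed out waiting for secondary CPUs.\n");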
[akpm@linux-foundation.org: simplifications, per Daniel] Link: http://lkml.kernel.org/r/1477091361-2039-1-git-send-email-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Brian Norris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/debug/debug_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0874e2edd275..79517e5549f1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -598,11 +598,11 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - time_left = loops_per_jiffy * HZ; + time_left = MSEC_PER_SEC; while (kgdb_do_roundup && --time_left && (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != online_cpus) - cpu_relax(); + udelay(1000); if (!time_left) pr_crit("Timed out waiting for secondary CPUs.\n"); -- cgit v1.2.3 From e01b04be3eb00456d65278cfa56cfd8103872bfb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:32 +0000 Subject: timekeeping: Force unsigned clocksource to nanoseconds conversion commit 9c1645727b8fa90d07256fdfcc45bf831242a3ab upstream. The clocksource delta to nanoseconds conversion is using signed math, but the delta is unsigned. This makes the conversion space smaller than necessary and in case of a multiplication overflow the conversion can become negative. The conversion is done with scaled math: s64 nsec_delta = ((s64)clkdelta * clk->mult) >> clk->shift; Shifting a signed integer right obviously preserves the sign, which has interesting consequences: - Time jumps backwards - __iter_div_u64_rem(), which is used in one of the calling code paths, will take forever to piecewise calculate the seconds/nanoseconds part. This has been reported by several people with different scenarios: David observed that when stopping a VM with a debugger: "It was essentially the stopped by debugger case. I forget exactly why, but the guest was being explicitly stopped from outside, it wasn't just scheduling lag. I think it was something in the vicinity of 10 minutes stopped." When lifting the stop the machine went dead. The stopped by debugger case is not really interesting, but nevertheless it would be a good thing not to die completely. But this was also observed on a live system by Liav: "When the OS is too overloaded, delta will get a high enough value for the msb of the sum delta * tkr->mult + tkr->xtime_nsec to be set, and so after the shift the nsec variable will gain a value similar to 0xffffffffff000000." Unfortunately this has been reintroduced recently with commit 6bd58f09e1d8 ("time: Add cycles to nanoseconds translation"). It had already been fixed a year ago in commit 35a4933a8959 ("time: Avoid signed overflow in timekeeping_get_ns()"). Though it's not surprising that the issue has been reintroduced, because the function itself and the whole call chain use s64 for the result and its propagation. The change in this recent commit is subtle: s64 nsec; - nsec = (d * m + n) >> s; + nsec = d * m + n; + nsec >>= s; d being of type cycle_t adds another level of obfuscation. This wouldn't have happened if the previous change to unsigned computation had made the 'nsec' variable u64 right away and a follow-up patch had cleaned up the whole call chain.
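A minimal standalone C sketch of the failure mode; the values are hypothetical, chosen only so the 64-bit product sets the sign bit, and the arithmetic right shift of a negative value assumes common compiler behavior:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t delta = 1ULL << 41;	/* hypothetical huge cycle delta */
		uint32_t mult  = 1U << 22;	/* hypothetical clocksource mult */
		uint32_t shift = 24;		/* hypothetical clocksource shift */

		/* delta * mult == 1ULL << 63: the product's sign bit is set */
		int64_t  s = (int64_t)(delta * mult) >> shift;	/* arithmetic shift */
		uint64_t u = (delta * mult) >> shift;		/* logical shift */

		printf("signed:   %lld\n", (long long)s);	/* large negative value */
		printf("unsigned: %llu\n", (unsigned long long)u);
		return 0;
	}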
There have been patches submitted which basically did a revert of the above patch, leaving everything else unchanged as signed. Back to square one. This spawned an admittedly pointless discussion about potential users which rely on the unsigned behaviour, until someone pointed out that it had been fixed before. The changelogs of said patches added further confusion, as they made ultimately false claims about the consequences for eventual users which expect signed results. Despite delta being cycle_t, aka u64, it's very well possible to hand in a signed negative value and the signed computation will happily return the correct result. But nobody actually sat down and analyzed the code which was added as a user after the probably unintended signed conversion. Though in sensitive code like this it's better to analyze it properly and make sure that nothing relies on this, rather than hunting the subtle wreckage half a year later. After analyzing all call chains it stands that no caller can hand in a negative value (which actually would work due to the s64 cast) and rely on the signed math to do the right thing. Change the conversion function to unsigned math. The conversion of all call chains is done in a follow-up patch. This solves the starvation issue, which was caused by the negative result, but it does not solve the underlying problem. It merely postpones it. When the timekeeper update is deferred long enough that the unsigned multiplication overflows, then time going backwards is observable again. Neither does it solve the issue of clocksources with a small counter width, which will wrap around possibly several times and cause random time stamps to be generated. But those are usually not found on systems used for virtualization, so this is likely a non-issue. I took the liberty to claim authorship for this simply because analyzing all callsites and writing the changelog took substantially more time than just making the simple s/s64/u64/ change and ignoring the rest. Fixes: 6bd58f09e1d8 ("time: Add cycles to nanoseconds translation") Reported-by: David Gibson Reported-by: Liav Rehana Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S. Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: John Stultz Link: http://lkml.kernel.org/r/20161208204228.688545601@linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 445601c580d6..738012d68117 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -298,10 +298,10 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, +static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, cycle_t delta) { - s64 nsec; + u64 nsec; nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; -- cgit v1.2.3 From e945df4c6bc2e3e188e02b199ce1766f557890e5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 Dec 2016 20:54:49 -0500 Subject: fgraph: Handle a case where a tracer ignores set_graph_notrace commit 794de08a16cf1fc1bf785dc48f66d36218cf6d88 upstream. Both the wakeup and irqsoff tracers can use the function graph tracer when the display-graph option is set.
The problem is that they ignore the notrace file, and record the entry of functions that would be ignored by the function_graph tracer. This causes the trace->depth to be recorded into the ring buffer. set_graph_notrace uses a trick of adding a large negative number to trace->depth when a graph function is to be ignored. On trace output, the graph function uses the depth to record a stack of functions. But since the depth is negative, it accesses the array with a negative number and causes an out-of-bounds access that can cause a kernel oops or corrupt data. Have the print functions handle cases where a tracer still records functions even when they are in set_graph_notrace. Also add warnings if the depth is below zero before accessing the array. Note, the function graph logic will still prevent the return of these functions from being recorded, which means that they will be left hanging without a return. For example:

  # echo '*spin*' > set_graph_notrace
  # echo 1 > options/display-graph
  # echo wakeup > current_tracer
  # cat trace
  [...]
      _raw_spin_lock() {
        preempt_count_add() {
        do_raw_spin_lock() {
      update_rq_clock();

Where it should look like:

      _raw_spin_lock() {
        preempt_count_add();
        do_raw_spin_lock();
      }
      update_rq_clock();

Cc: Namhyung Kim Fixes: 29ad23b00474 ("ftrace: Add set_graph_notrace filter") Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_functions_graph.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..7fd6f5a26143 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -780,6 +780,10 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data = per_cpu_ptr(data->cpu_data, cpu); + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + /* * Comments display at + 1 to depth.
Since * this is a leaf function, keep the comments @@ -788,7 +792,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->depth = call->depth - 1; /* No need to keep this function around for this depth */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = 0; } @@ -818,11 +823,16 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); cpu_data->depth = call->depth; /* Save this function pointer to see if the exit matches */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = call->func; } @@ -1052,7 +1062,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, */ cpu_data->depth = trace->depth - 1; - if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (trace->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(trace->depth < 0)) { if (cpu_data->enter_funcs[trace->depth] != trace->func) func_match = 0; cpu_data->enter_funcs[trace->depth] = 0; -- cgit v1.2.3 From 56ef587b77fddbc1b55a22253245a968cf7a1c58 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Jan 2017 11:47:50 +0100 Subject: stable-fixup: hotplug: fix unused function warning [resolves a messed up backport, so no matching upstream commit] The backport of upstream commit 777c6e0daebb ("hotplug: Make register and unregister notifier API symmetric") to linux-4.4.y introduced a harmless warning in 'allnoconfig' builds as spotted by kernelci.org: kernel/cpu.c:226:13: warning: 'cpu_notify_nofail' defined but not used [-Wunused-function] So far, this is the only stable tree that is affected, as linux-4.6 and higher contain commit 984581728eb4 ("cpu/hotplug: Split out cpu down functions") that makes the function used in all configurations, while older longterm releases so far don't seem to have a backport of 777c6e0daebb. The fix for the warning is trivial: move the unused function back into the #ifdef section where it was before. 
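The warning pattern reduces to the following sketch (hypothetical names, not the kernel code):

	static int helper(void)		/* defined unconditionally */
	{
		return 0;
	}

	#ifdef CONFIG_FOO		/* ...but only called under this option */
	void caller(void)
	{
		helper();
	}
	#endif

	/*
	 * With CONFIG_FOO unset (e.g. allnoconfig), helper() has no callers
	 * and gcc emits "'helper' defined but not used".  Moving the
	 * definition inside the #ifdef, as the patch below does for
	 * cpu_notify_nofail(), silences the warning.
	 */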
Link: https://kernelci.org/build/id/586fcacb59b514049ef6c3aa/logs/ Fixes: 1c0f4e0ebb79 ("hotplug: Make register and unregister notifier API symmetric") in v4.4.y Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- kernel/cpu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index cd6d1258554e..40d20bf5de28 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -223,10 +223,6 @@ static int cpu_notify(unsigned long val, void *v) return __cpu_notify(val, v, -1, NULL); } -static void cpu_notify_nofail(unsigned long val, void *v) -{ - BUG_ON(cpu_notify(val, v)); -} EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -245,6 +241,11 @@ void __unregister_cpu_notifier(struct notifier_block *nb) EXPORT_SYMBOL(__unregister_cpu_notifier); #ifdef CONFIG_HOTPLUG_CPU +static void cpu_notify_nofail(unsigned long val, void *v) +{ + BUG_ON(cpu_notify(val, v)); +} + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id -- cgit v1.2.3 From 6053479cbbc58df088c2ab248f0d1a46109dfc0c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Dec 2016 12:10:37 +0100 Subject: tick/broadcast: Prevent NULL pointer dereference commit c1a9eeb938b5433947e5ea22f89baff3182e7075 upstream. When a dysfunctional timer, e.g. a dummy timer, is installed, the tick core tries to set up the broadcast timer. If no broadcast device is installed, the kernel crashes with a NULL pointer dereference in tick_broadcast_setup_oneshot() because the function has no sanity check. Reported-by: Mason Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: Anna-Maria Gleixner Cc: Richard Cochran Cc: Sebastian Andrzej Siewior Cc: Daniel Lezcano Cc: Peter Zijlstra Cc: Sebastian Frias Cc: Thibaud Cornic Cc: Robin Murphy Link: http://lkml.kernel.org/r/1147ef90-7877-e4d2-bb2b-5c4fa8d3144b@free.fr Signed-off-by: Greg Kroah-Hartman --- kernel/time/tick-broadcast.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f6aae7977824..d2a20e83ebae 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -871,6 +871,9 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { int cpu = smp_processor_id(); + if (!bc) + return; + /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = clockevent_state_periodic(bc); -- cgit v1.2.3 From 52a2ef75c34af99c4c383dfe357ed1bb84a49bcc Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 9 Jan 2017 17:20:11 +0000 Subject: DEBUG: sched/fair: Fix missing sched_load_avg_cpu events update_cfs_rq_load_avg is called from update_blocked_averages without triggering the sched_load_avg_cpu event. Move the event trigger to inside update_cfs_rq_load_avg to avoid this missing event.
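Schematically, the intent is the pattern below; the __update_load_avg() call is an illustrative stub standing in for the real decay/removal logic, not the actual fair.c signature:

	static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
	{
		int decayed = __update_load_avg(now, cfs_rq);	/* stub */

		/*
		 * Tracing here covers every caller, including
		 * update_blocked_averages(), instead of only update_load_avg().
		 */
		trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);

		return decayed;
	}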
Change-Id: I6c4f66f687a644e4e7f798db122d28a8f5919b7b Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 30d76a18ae1a..ad1507e420e8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2757,6 +2757,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + return decayed || removed; } @@ -2780,7 +2782,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); - trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -- cgit v1.2.3 From ee620ddd6581cf9779d27677f6f0f11e3f939a8c Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 10 Jan 2017 11:31:01 +0000 Subject: DEBUG: sched/fair: Fix sched_load_avg_cpu events for task_groups The current sched_load_avg_cpu event traces the load for any cfs_rq that is updated. This is not representative of the CPU load - instead we should only trace this event when the cfs_rq being updated is in the root_task_group. Change-Id: I345c2f13f6b5718cb4a89beb247f7887ce97ed6b Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ad1507e420e8..3331f453a17f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2757,7 +2757,9 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */ + if (cfs_rq == &rq_of(cfs_rq)->cfs) + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); return decayed || removed; } -- cgit v1.2.3 From 70429b970bde30e386790a83a97fc01ed7dbfd3b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 10 Jan 2017 16:57:36 -0800 Subject: mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done} commit f931ab479dd24cf7a2c6e2df19778406892591fb upstream. Both arch_add_memory() and arch_remove_memory() expect a single threaded context. For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does not hold any locks over this check and branch: if (pgd_val(*pgd)) { pud = (pud_t *)pgd_page_vaddr(*pgd); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); continue; } pud = alloc_low_page(); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); The result is that two threads calling devm_memremap_pages() simultaneously can end up colliding on pgd initialization. This leads to crash signatures like the following where the loser of the race initializes the wrong pgd entry: BUG: unable to handle kernel paging request at ffff888ebfff0000 IP: memcpy_erms+0x6/0x10 PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */ Oops: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13 task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000 RIP: memcpy_erms+0x6/0x10 [..] Call Trace: ? pmem_do_bvec+0x205/0x370 [nd_pmem] ? 
blk_queue_enter+0x3a/0x280 pmem_rw_page+0x38/0x80 [nd_pmem] bdev_read_page+0x84/0xb0 Hold the standard memory hotplug mutex over calls to arch_{add,remove}_memory(). Fixes: 41e94a851304 ("add devm_memremap_pages") Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/memremap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 25ced161ebeb..f719c925cb54 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -159,7 +159,9 @@ static void devm_memremap_pages_release(struct device *dev, void *res) struct page_map *page_map = res; /* pages are dead and unused, undo the arch mapping */ + mem_hotplug_begin(); arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); + mem_hotplug_done(); } void *devm_memremap_pages(struct device *dev, struct resource *res) @@ -189,7 +191,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (nid < 0) nid = numa_mem_id(); + mem_hotplug_begin(); error = arch_add_memory(nid, res->start, resource_size(res), true); + mem_hotplug_done(); if (error) { devres_free(page_map); return ERR_PTR(error); -- cgit v1.2.3 From 3d27cd4b25272943300d125b20d315fd96ae0794 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 16 Dec 2016 14:30:35 -0800 Subject: jump_labels: API for flushing deferred jump label updates commit b6416e61012429e0277bd15a229222fd17afc1c1 upstream. Modules that use static_key_deferred need a way to synchronize with any delayed work that is still pending when the module is unloaded. Introduce static_key_deferred_flush() which flushes any pending jump label updates. Signed-off-by: David Matlack Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- kernel/jump_label.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4b353e0be121..453ec4232852 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -138,6 +138,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +void static_key_deferred_flush(struct static_key_deferred *key) +{ + STATIC_KEY_CHECK_USE(); + flush_delayed_work(&key->work); +} +EXPORT_SYMBOL_GPL(static_key_deferred_flush); + void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { -- cgit v1.2.3 From d1b232c2ce53d35dfb9c9e7447eeef42cea20e42 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Jan 2017 18:20:55 -0800 Subject: sysctl: fix proc_doulongvec_ms_jiffies_minmax() commit ff9f8a7cf935468a94d9927c68b00daae701667e upstream. We perform the conversion between kernel jiffies and ms only when exporting kernel value to user space. We need to do the opposite operation when value is written by user. 
Only matters when HZ != 1000 Signed-off-by: Eric Dumazet Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 999e025bf68e..2f0d157258a2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2414,6 +2414,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int break; if (neg) continue; + val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) continue; *i = val; -- cgit v1.2.3 From d49d465d178fd62c0420827fb3c838b7d11a7ec1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2017 23:15:08 +0100 Subject: perf/core: Fix PERF_RECORD_MMAP2 prot/flags for anonymous memory commit 0b3589be9b98994ce3d5aeca52445d1f5627c4ba upstream. Andres reported that MMAP2 records for anonymous memory always have their protection field 0. Turns out, someone daft put the prot/flags generation code in the file branch, leaving them unset for anonymous memory. Reported-by: Andres Freund Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@kernel.org Cc: anton@ozlabs.org Cc: namhyung@kernel.org Fixes: f972eb63b100 ("perf: Pass protection and flags bits through mmap2 interface") Link: http://lkml.kernel.org/r/20170126221508.GF6536@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bc6371b0e4fb..9bbe9ac23cf2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6039,6 +6039,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) char *buf = NULL; char *name; + if (vma->vm_flags & VM_READ) + prot |= PROT_READ; + if (vma->vm_flags & VM_WRITE) + prot |= PROT_WRITE; + if (vma->vm_flags & VM_EXEC) + prot |= PROT_EXEC; + + if (vma->vm_flags & VM_MAYSHARE) + flags = MAP_SHARED; + else + flags = MAP_PRIVATE; + + if (vma->vm_flags & VM_DENYWRITE) + flags |= MAP_DENYWRITE; + if (vma->vm_flags & VM_MAYEXEC) + flags |= MAP_EXECUTABLE; + if (vma->vm_flags & VM_LOCKED) + flags |= MAP_LOCKED; + if (vma->vm_flags & VM_HUGETLB) + flags |= MAP_HUGETLB; + if (file) { struct inode *inode; dev_t dev; @@ -6065,27 +6086,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) maj = MAJOR(dev); min = MINOR(dev); - if (vma->vm_flags & VM_READ) - prot |= PROT_READ; - if (vma->vm_flags & VM_WRITE) - prot |= PROT_WRITE; - if (vma->vm_flags & VM_EXEC) - prot |= PROT_EXEC; - - if (vma->vm_flags & VM_MAYSHARE) - flags = MAP_SHARED; - else - flags = MAP_PRIVATE; - - if (vma->vm_flags & VM_DENYWRITE) - flags |= MAP_DENYWRITE; - if (vma->vm_flags & VM_MAYEXEC) - flags |= MAP_EXECUTABLE; - if (vma->vm_flags & VM_LOCKED) - flags |= MAP_LOCKED; - if (vma->vm_flags & VM_HUGETLB) - flags |= MAP_HUGETLB; - goto got_name; } else { if (vma->vm_ops && vma->vm_ops->name) { -- cgit v1.2.3 From a0cfd72b3fcf3b2634494bc83340160c7736fc18 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2016 15:27:07 +0100 Subject: sched/rt: Fix PI handling vs. 
sched_setscheduler() Andrea Parri reported: > I found that the following scenario (with CONFIG_RT_GROUP_SCHED=y) is not > handled correctly: > > T1 (prio = 20) > lock(rtmutex); > > T2 (prio = 20) > blocks on rtmutex (rt_nr_boosted = 0 on T1's rq) > > T1 (prio = 20) > sys_set_scheduler(prio = 0) > [new_effective_prio == oldprio] > T1 prio = 20 (rt_nr_boosted = 0 on T1's rq) > > The last step is incorrect as T1 is now boosted (c.f., rt_se_boosted()); > in particular, if we continue with > > T1 (prio = 20) > unlock(rtmutex) > wakeup(T2) > adjust_prio(T1) > [prio != rt_mutex_getprio(T1)] > dequeue(T1) > rt_nr_boosted = (unsigned long)(-1) > ... > T1 prio = 0 > > then we end up leaving rt_nr_boosted in an "inconsistent" state. > > The simple program attached could reproduce the previous scenario; note > that, as a consequence of the presence of this state, the "assertion" > > WARN_ON(!rt_nr_running && rt_nr_boosted) > > from dec_rt_group() may trigger. So normally we dequeue/enqueue tasks in sched_setscheduler(), which would ensure the accounting stays correct. However in the early PI path we fail to do so. So this was introduced at around v3.14, by: c365c292d059 ("sched: Consider pi boosting in setscheduler()") which fixed another problem exactly because that dequeue/enqueue, joy. Fix this by teaching rt about DEQUEUE_SAVE/ENQUEUE_RESTORE and have it preserve runqueue location with that option. This requires decoupling the on_rt_rq() state from being on the list. In order to allow for explicit movement during the SAVE/RESTORE, introduce {DE,EN}QUEUE_MOVE. We still must use SAVE/RESTORE in these cases to preserve other invariants. Respecting the SAVE/RESTORE flags also has the (nice) side-effect that things like sys_nice()/sys_sched_setaffinity() also do not reorder FIFO tasks (whereas they used to before this patch). 
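The SAVE/RESTORE-versus-MOVE semantics reduce to a single predicate, mirrored from the move_entity() helper the patch adds:

	/*
	 * SAVE without MOVE means "spurious dequeue/enqueue done only to
	 * allow modification": keep the entity's position in its priority
	 * list.  Any other flag combination actually relocates it.
	 */
	static inline bool move_entity(unsigned int flags)
	{
		if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
			return false;

		return true;
	}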
Change-Id: I1450923252f55dba19f450008db813113eb06c76 Reported-by: Andrea Parri Tested-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar [pkondeti@codeaurora.org: Fix trivial merge conflict] Git-commit: ff77e468535987b3d21b7bd4da15608ea3ce7d0b Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Signed-off-by: Pavankumar Kondeti --- kernel/sched/core.c | 39 +++++++++++++----------- kernel/sched/rt.c | 86 ++++++++++++++++++++++++++++++++++++---------------- kernel/sched/sched.h | 38 ++++++++++++++++++----- 3 files changed, 112 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3fcadbae663d..57452c970a2b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2321,6 +2321,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -3735,7 +3739,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3763,11 +3767,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3785,7 +3793,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3793,7 +3801,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3808,7 +3816,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4164,6 +4172,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -4346,17 +4355,14 @@ change: * itself. 
*/ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4366,15 +4372,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -8707,7 +8712,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -8731,7 +8736,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b72352bbd752..07b2c63e4983 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -441,7 +441,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -487,8 +487,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -504,7 +504,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -521,7 +521,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1257,7 +1257,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct 
sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1270,26 +1293,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1298,7 +1332,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1311,31 +1345,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1351,7 +1385,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1363,7 +1397,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); diff --git 
a/kernel/sched/sched.h b/kernel/sched/sched.h index 360e298398fb..75500042fd32 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1941,19 +1941,41 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 -#define ENQUEUE_WAKEUP_NEW 0x20 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 +#define ENQUEUE_WAKEUP_NEW 0x40 #define RETRY_TASK ((void *)-1UL) -- cgit v1.2.3 From e6394c7d1c191005baa2d39ca2cc325374a7a35f Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 30 Dec 2016 16:17:55 +0800 Subject: futex: Move futex_init() to core_initcall commit 25f71d1c3e98ef0e52371746220d66458eac75bc upstream. The UEVENT user mode helper is enabled before the initcalls are executed and is available when the root filesystem has been mounted. The user mode helper is triggered by device init calls and the executable might use the futex syscall. futex_init() is marked __initcall which maps to device_initcall, but there is no guarantee that futex_init() is invoked _before_ the first device init call which triggers the UEVENT user mode helper. If the user mode helper uses the futex syscall before futex_init() then the syscall crashes with a NULL pointer dereference because the futex subsystem has not been initialized yet. Move futex_init() to core_initcall so futexes are initialized before the root filesystem is mounted and the usermode helper becomes available. [ tglx: Rewrote changelog ] Signed-off-by: Yang Yang Cc: jiang.biao2@zte.com.cn Cc: jiang.zhengxiong@zte.com.cn Cc: zhong.weidong@zte.com.cn Cc: deng.huali@zte.com.cn Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1483085875-6130-1-git-send-email-yang.yang29@zte.com.cn Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 9d8163afd87c..9d251dc3ec40 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3199,4 +3199,4 @@ static int __init futex_init(void) return 0; } -__initcall(futex_init); +core_initcall(futex_init); -- cgit v1.2.3 From efa061998d226a531bda4e8e373d9ef9edd0bb93 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 18 Feb 2017 03:42:54 -0800 Subject: printk: use rcuidle console tracepoint commit fc98c3c8c9dcafd67adcce69e6ce3191d5306c9c upstream. 
Use rcuidle console tracepoint because, apparently, it may be issued from an idle CPU: hw-breakpoint: Failed to enable monitor mode on CPU 0. hw-breakpoint: CPU 0 failed to disable vector catch =============================== [ ERR: suspicious RCU usage. ] 4.10.0-rc8-next-20170215+ #119 Not tainted ------------------------------- ./include/trace/events/printk.h:32 suspicious rcu_dereference_check() usage! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 2, debug_locks = 0 RCU used illegally from extended quiescent state! 2 locks held by swapper/0/0: #0: (cpu_pm_notifier_lock){......}, at: [] cpu_pm_exit+0x10/0x54 #1: (console_lock){+.+.+.}, at: [] vprintk_emit+0x264/0x474 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-rc8-next-20170215+ #119 Hardware name: Generic OMAP4 (Flattened Device Tree) console_unlock vprintk_emit vprintk_default printk reset_ctrl_regs dbg_cpu_pm_notify notifier_call_chain cpu_pm_exit omap_enter_idle_coupled cpuidle_enter_state cpuidle_enter_state_coupled do_idle cpu_startup_entry start_kernel This RCU warning, however, is suppressed by lockdep_off() in printk(). lockdep_off() increments the ->lockdep_recursion counter and thus disables RCU_LOCKDEP_WARN() and debug_lockdep_rcu_enabled(), which want lockdep to be enabled "current->lockdep_recursion == 0". Link: http://lkml.kernel.org/r/20170217015932.11898-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Reported-by: Tony Lindgren Tested-by: Tony Lindgren Acked-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) Cc: Petr Mladek Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Lindgren Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c048e34b177f..0b5613554769 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1436,7 +1436,7 @@ static void call_console_drivers(int level, { struct console *con; - trace_console(text, len); + trace_console_rcuidle(text, len); if (level >= console_loglevel && !ignore_loglevel) return; -- cgit v1.2.3 From 73f527b67c02b5273d71bfcc25be7693e3cbad86 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Mon, 27 Feb 2017 09:25:59 +0530 Subject: sched: Print aggregation status in sched_get_busy trace event Aggregation for frequency is not enabled all the time. The aggregated load is attached to the most busy CPU only when the group load is above a certain threshold. Print the aggregation status in sched_get_busy trace event to make debugging and testing easier. 
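The new trace argument is simply the condition under which the aggregated load is attached to this CPU, taken from the diff below:

	trace_sched_get_busy(cpu, busy[i].prev_load,
			     busy[i].new_task_load,
			     busy[i].predicted_load,
			     early_detection[i],
			     /* true only for the busiest CPU while
			      * aggregation is active */
			     aggregate_load && cpu == max_busy_cpu);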
Change-Id: Icb916f362ea0fa8b5dc7d23cb384168d86159687 Signed-off-by: Pavankumar Kondeti --- kernel/sched/hmp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 744c60dfb4fb..df47c26ab6d2 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -3274,7 +3274,9 @@ exit_early: trace_sched_get_busy(cpu, busy[i].prev_load, busy[i].new_task_load, busy[i].predicted_load, - early_detection[i]); + early_detection[i], + aggregate_load && + cpu == max_busy_cpu); i++; } } -- cgit v1.2.3 From 738d2a8feabb9dfce7595ad7db045d5442ac289c Mon Sep 17 00:00:00 2001 From: Mohit Aggarwal Date: Thu, 16 Feb 2017 14:29:20 +0530 Subject: alarmtimer: Program mpm wakeup time in milliseconds Currently, the mpm wakeup time is programmed in seconds, so the wakeup can happen later than the expected time. This patch fixes the issue by programming the mpm wakeup time in milliseconds. CRs-Fixed: 2010001 Change-Id: I5c4905a0386e60ae54876f30d89f445fd06a161c Signed-off-by: Mohit Aggarwal --- kernel/time/alarmtimer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0cdc34ebd8d1..2af5687b83c9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -412,12 +412,10 @@ static int alarmtimer_suspend(struct device *dev) now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); if (poweron_alarm) { - struct rtc_time tm_val; - unsigned long secs; + uint64_t msec = 0; - tm_val = rtc_ktime_to_tm(min); - rtc_tm_to_time(&tm_val, &secs); - lpm_suspend_wake_time(secs); + msec = ktime_to_ms(min); + lpm_suspend_wake_time(msec); } else { /* Set alarm, if in the past reject suspend briefly to handle */ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); -- cgit v1.2.3 From db79acdf6fc2f400bf7445d2c69489af70651d52 Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Wed, 1 Mar 2017 15:03:05 -0800 Subject: watchdog: Induce non secure watchdog bite for lockup Induce a non-secure watchdog bite whenever a kernel soft lockup or hard lockup is detected, so that the proper context of the CPUs can be collected for debugging. Change-Id: I613391d8d53fe52ce7934cdc910fb135c4e0fbf2 Signed-off-by: Prasad Sodagudi --- kernel/watchdog.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f2813e137b23..91fa701b4a24 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * The run state of the lockup detectors is controlled by the content of the @@ -366,8 +367,11 @@ static void watchdog_check_hardlockup_other_cpu(void) if (per_cpu(hard_watchdog_warn, next_cpu) == true) return; - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu); + if (hardlockup_panic) { + pr_err("Watchdog detected hard LOCKUP on cpu %u", + next_cpu); + msm_trigger_wdog_bite(); + } else WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu); @@ -430,6 +434,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + if (hardlockup_panic) + msm_trigger_wdog_bite(); + print_modules(); print_irqtrace_events(current); if (regs) @@ -552,6 +559,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! 
[%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); + + if (softlockup_panic) + msm_trigger_wdog_bite(); __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); -- cgit v1.2.3 From 54547c9f7577d6503938b32ea796ef8abba102fc Mon Sep 17 00:00:00 2001 From: "Se Wang (Patrick) Oh" Date: Fri, 29 May 2015 14:57:05 -0700 Subject: clocksource: add API to force re-selection of the best clocksource As the best clocksource is not selected until core boot completes, only the periodic tick timer works, and it increases jiffies by one on every tick update. If interrupts are disabled for more than one tick (10ms), timer interrupts are missed, jiffies can't be updated every 10ms, and time can fall behind real time. So add an API to force re-selection of the best clocksource among the registered clocksources, so that the best clocksource can be selected whenever it becomes available. Change-Id: I481de3cdf1df8f0e35ed10aee7ab3882bf7a35b3 Signed-off-by: Se Wang (Patrick) Oh Signed-off-by: Prasad Sodagudi --- kernel/time/clocksource.c | 48 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b98810d2f3b4..89cc82a38e4d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -108,7 +108,7 @@ static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); -static void clocksource_select(void); +static void clocksource_select(bool force); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; @@ -415,7 +415,7 @@ static int clocksource_watchdog_kthread(void *data) { mutex_lock(&clocksource_mutex); if (__clocksource_watchdog_kthread()) - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -555,11 +555,12 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur, + bool force) { struct clocksource *cs; - if (!finished_booting || list_empty(&clocksource_list)) + if ((!finished_booting && !force) || list_empty(&clocksource_list)) return NULL; /* @@ -577,13 +578,13 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) return NULL; } -static void __clocksource_select(bool skipcur) +static void __clocksource_select(bool skipcur, bool force) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot, skipcur); + best = clocksource_find_best(oneshot, skipcur, force); if (!best) return; @@ -623,22 +624,40 @@ static void __clocksource_select(bool skipcur) * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. 
Change-Id: I613391d8d53fe52ce7934cdc910fb135c4e0fbf2
Signed-off-by: Prasad Sodagudi
---
 kernel/watchdog.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f2813e137b23..91fa701b4a24 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include

 /*
  * The run state of the lockup detectors is controlled by the content of the
@@ -366,8 +367,11 @@ static void watchdog_check_hardlockup_other_cpu(void)
        if (per_cpu(hard_watchdog_warn, next_cpu) == true)
                return;

-       if (hardlockup_panic)
-               panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+       if (hardlockup_panic) {
+               pr_err("Watchdog detected hard LOCKUP on cpu %u",
+                               next_cpu);
+               msm_trigger_wdog_bite();
+       }
        else
                WARN(1, "Watchdog detected hard LOCKUP on cpu %u",
                        next_cpu);
@@ -430,6 +434,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
                return;

        pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+       if (hardlockup_panic)
+               msm_trigger_wdog_bite();
+
        print_modules();
        print_irqtrace_events(current);
        if (regs)
@@ -552,6 +559,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
                pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                        smp_processor_id(), duration,
                        current->comm, task_pid_nr(current));
+
+               if (softlockup_panic)
+                       msm_trigger_wdog_bite();
                __this_cpu_write(softlockup_task_ptr_saved, current);
                print_modules();
                print_irqtrace_events(current);
-- cgit v1.2.3

From 54547c9f7577d6503938b32ea796ef8abba102fc Mon Sep 17 00:00:00 2001
From: "Se Wang (Patrick) Oh"
Date: Fri, 29 May 2015 14:57:05 -0700
Subject: clocksource: add API to force re-selection of the best clocksource

Because the best clocksource is not selected until core boot
completes, only the periodic tick timer runs early on, incrementing
jiffies by one per tick. If interrupts stay disabled for more than
one tick (10ms), timer interrupts are missed, jiffies is not updated
every 10ms, and time falls behind real time. Add an API to force
re-selection of the best clocksource among the registered
clocksources, so that the best one can be selected as soon as it
becomes available.
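A minimal sketch of how an early clocksource driver might use the new
API (the driver, struct name, and rate are hypothetical;
clocksource_register_hz() is the standard registration helper):

        /* registered before clocksource_done_booting() has run */
        if (!clocksource_register_hz(&my_early_cs, 19200000))
                /*
                 * Force re-selection instead of waiting for boot to
                 * finish, so timekeeping can leave the jiffies-based
                 * source immediately.
                 */
                clocksource_select_force();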
Change-Id: I481de3cdf1df8f0e35ed10aee7ab3882bf7a35b3
Signed-off-by: Se Wang (Patrick) Oh
Signed-off-by: Prasad Sodagudi
---
 kernel/time/clocksource.c | 48 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b98810d2f3b4..89cc82a38e4d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -108,7 +108,7 @@ static int finished_booting;
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 static void clocksource_watchdog_work(struct work_struct *work);
-static void clocksource_select(void);
+static void clocksource_select(bool force);

 static LIST_HEAD(watchdog_list);
 static struct clocksource *watchdog;
@@ -415,7 +415,7 @@ static int clocksource_watchdog_kthread(void *data)
 {
        mutex_lock(&clocksource_mutex);
        if (__clocksource_watchdog_kthread())
-               clocksource_select();
+               clocksource_select(false);
        mutex_unlock(&clocksource_mutex);
        return 0;
 }
@@ -555,11 +555,12 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs)

 #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET

-static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
+static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur,
+                                                bool force)
 {
        struct clocksource *cs;

-       if (!finished_booting || list_empty(&clocksource_list))
+       if ((!finished_booting && !force) || list_empty(&clocksource_list))
                return NULL;

        /*
@@ -577,13 +578,13 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
        return NULL;
 }

-static void __clocksource_select(bool skipcur)
+static void __clocksource_select(bool skipcur, bool force)
 {
        bool oneshot = tick_oneshot_mode_active();
        struct clocksource *best, *cs;

        /* Find the best suitable clocksource */
-       best = clocksource_find_best(oneshot, skipcur);
+       best = clocksource_find_best(oneshot, skipcur, force);
        if (!best)
                return;

@@ -623,22 +624,40 @@ static void __clocksource_select(bool skipcur)
 * Select the clocksource with the best rating, or the clocksource,
 * which is selected by userspace override.
 */
-static void clocksource_select(void)
+static void clocksource_select(bool force)
 {
-       __clocksource_select(false);
+       return __clocksource_select(false, force);
 }

 static void clocksource_select_fallback(void)
 {
-       __clocksource_select(true);
+       __clocksource_select(true, false);
 }

 #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
-static inline void clocksource_select(void) { }
+
+static inline void clocksource_select(bool force) { }
 static inline void clocksource_select_fallback(void) { }

 #endif

+/**
+ * clocksource_select_force - Force re-selection of the best clocksource
+ * among registered clocksources
+ *
+ * clocksource_select() can't select the best clocksource before
+ * clocksource_done_booting() has run, and clocksource_select()
+ * must be called with clocksource_mutex held, so provide a new API
+ * that other files can call to select the best clocksource
+ * irrespective of the finished_booting flag.
+ */
+void clocksource_select_force(void)
+{
+       mutex_lock(&clocksource_mutex);
+       clocksource_select(true);
+       mutex_unlock(&clocksource_mutex);
+}
+
 /*
  * clocksource_done_booting - Called near the end of core bootup
  *
@@ -655,7 +674,7 @@ static int __init clocksource_done_booting(void)
         * Run the watchdog first to eliminate unstable clock sources
         */
        __clocksource_watchdog_kthread();
-       clocksource_select();
+       clocksource_select(false);
        mutex_unlock(&clocksource_mutex);
        return 0;
 }
@@ -744,6 +763,7 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
 }
 EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);

+
 /**
  * __clocksource_register_scale - Used to install new clocksources
  * @cs: clocksource to be registered
@@ -765,7 +785,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
        mutex_lock(&clocksource_mutex);
        clocksource_enqueue(cs);
        clocksource_enqueue_watchdog(cs);
-       clocksource_select();
+       clocksource_select(false);
        clocksource_select_watchdog(false);
        mutex_unlock(&clocksource_mutex);
        return 0;
@@ -788,7 +808,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
 {
        mutex_lock(&clocksource_mutex);
        __clocksource_change_rating(cs, rating);
-       clocksource_select();
+       clocksource_select(false);
        clocksource_select_watchdog(false);
        mutex_unlock(&clocksource_mutex);
 }
@@ -892,7 +912,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,

        ret = sysfs_get_uname(buf, override_name, count);
        if (ret >= 0)
-               clocksource_select();
+               clocksource_select(false);

        mutex_unlock(&clocksource_mutex);
-- cgit v1.2.3

From 7005c6eec7b9768186bdb9630ca8a2043c8aed8f Mon Sep 17 00:00:00 2001
From: Omprakash Dhyade
Date: Wed, 8 Mar 2017 12:58:21 -0800
Subject: tracing: Free saved_tgids memory in free_saved_cmdlines_buffer

The saved_tgids memory was not freed in free_saved_cmdlines_buffer,
which leaks memory whenever the /d/tracing/saved_cmdlines_size node
value is changed. Fix it by freeing saved_tgids in
free_saved_cmdlines_buffer.
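For illustration, a simplified sketch of the resize path that leaked
(modeled on kernel/trace/trace.c, not verbatim):

        /* echo N > /d/tracing/saved_cmdlines_size lands here */
        static int tracing_resize_saved_cmdlines(unsigned int val)
        {
                struct saved_cmdlines_buffer *s, *savedcmd_temp;

                s = kmalloc(sizeof(*s), GFP_KERNEL);
                if (!s)
                        return -ENOMEM;

                if (allocate_cmdlines_buffer(val, s) < 0) {
                        kfree(s);
                        return -ENOMEM;
                }

                arch_spin_lock(&trace_cmdline_lock);
                savedcmd_temp = savedcmd;
                savedcmd = s;
                arch_spin_unlock(&trace_cmdline_lock);

                /*
                 * Tears the old buffer down member by member; any member
                 * missing there (saved_tgids) leaks on every resize.
                 */
                free_saved_cmdlines_buffer(savedcmd_temp);

                return 0;
        }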
Change-Id: I006870c858b7306b7d9c840b7712061985dda310
Signed-off-by: Omprakash Dhyade
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9b15d6eb1622..66d9e907aa07 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3991,6 +3991,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
 {
        kfree(s->saved_cmdlines);
        kfree(s->map_cmdline_to_pid);
+       kfree(s->saved_tgids);
        kfree(s);
 }
-- cgit v1.2.3

From 4f659aa55edf0c322710abfd9389501f8f68dc79 Mon Sep 17 00:00:00 2001
From: Prasad Sodagudi
Date: Mon, 6 Mar 2017 11:04:46 -0800
Subject: rcu: Induce msm watchdog bite for rcu stalls

Every RCU stall needs to be debugged, so collect RAM dumps on every
RCU stall for further debugging by inducing a non-secure watchdog
bite whenever a stall is detected.

Change-Id: I6c1cfddc92f06b48c3f22fe9970b70f2ec670bf6
Signed-off-by: Prasad Sodagudi
---
 kernel/rcu/tree.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f07343b54fe5..2cb46d51d715 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,8 @@
 #include
 #include

+#include
+
 #include "tree.h"
 #include "rcu.h"

@@ -1298,6 +1300,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)

        rcu_check_gp_kthread_starvation(rsp);

+#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE
+       /* Induce watchdog bite */
+       msm_trigger_wdog_bite();
+#endif
+
        force_quiescent_state(rsp);  /* Kick them all. */
 }

@@ -1333,6 +1340,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
                jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);

+#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE
+       /* Induce non secure watchdog bite to collect context */
+       msm_trigger_wdog_bite();
+#endif
+
        /*
         * Attempt to revive the RCU machinery by forcing a context switch.
         *
-- cgit v1.2.3

From c0ef1f537a9728a91475f8f73cf8b5764e7a9848 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers
Date: Thu, 3 Nov 2016 10:29:28 -0600
Subject: Fix: Disable sys_membarrier when nohz_full is enabled

commit 907565337ebf998a68cb5c5b2174ce5e5da065eb upstream.

Userspace applications should be allowed to expect the membarrier
system call with MEMBARRIER_CMD_SHARED command to issue memory
barriers on nohz_full CPUs, but synchronize_sched() does not take
those into account.

Given that we do not want unrelated processes to be able to affect
real-time sensitive nohz_full CPUs, simply return ENOSYS when
membarrier is invoked on a kernel with enabled nohz_full CPUs.
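As a sketch of the userspace-visible effect (hypothetical caller;
there was no glibc wrapper at the time, so the call goes through
syscall(2)):

        #include <errno.h>
        #include <linux/membarrier.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Returns 0 if a shared memory barrier was issued. */
        static int shared_barrier(void)
        {
                int ret = syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED, 0);

                if (ret && errno == ENOSYS) {
                        /*
                         * Kernel without membarrier, or (after this fix)
                         * booted with nohz_full: callers need a fallback,
                         * e.g. explicit barriers on both sides.
                         */
                }
                return ret;
        }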
Signed-off-by: Mathieu Desnoyers
CC: Josh Triplett
CC: Steven Rostedt
Signed-off-by: Paul E. McKenney
Cc: Frederic Weisbecker
Cc: Chris Metcalf
Cc: Rik van Riel
Acked-by: Lai Jiangshan
Reviewed-by: Josh Triplett
Signed-off-by: Greg Kroah-Hartman
---
 kernel/membarrier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727a56e9..9f9284f37f8d 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,6 +16,7 @@

 #include
 #include
+#include

 /*
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
@@ -51,6 +52,9 @@
 */
 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 {
+       /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+       if (tick_nohz_full_enabled())
+               return -ENOSYS;
        if (unlikely(flags))
                return -EINVAL;
        switch (cmd) {
-- cgit v1.2.3

From 27e15c3ae3a1d5065a053be2a2b24429417aa995 Mon Sep 17 00:00:00 2001
From: Pavankumar Kondeti
Date: Fri, 17 Mar 2017 15:29:13 +0530
Subject: core_ctl: kill maintenance of online state

The online state is cached in CPU hotplug callbacks and used to
determine the potential candidates for isolation or unisolation. The
current code has checks in the hotplug notifier callback to correct
the online state if it is out of sync with the actual CPU state. All
of this maintenance can be avoided by directly using cpu_online().

This patch also fixes an incorrect warning message emitted from the
CPU_UP_CANCELED event callback.

Change-Id: I9681ce6c5bc701507427d60db397ca4966fbdb58
Signed-off-by: Pavankumar Kondeti
---
 kernel/sched/core_ctl.c | 33 +++++----------------------------
 1 file changed, 5 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index 1e3accddd103..d90cd817089c 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -50,7 +50,6 @@ struct cluster_data {
 };

 struct cpu_data {
-       bool online;
        bool is_busy;
        unsigned int busy;
        unsigned int cpu;
@@ -252,7 +251,7 @@ static ssize_t show_cpus(const struct cluster_data *state, char *buf)
        list_for_each_entry(c, &state->lru, sib) {
                count += snprintf(buf + count, PAGE_SIZE - count,
                                  "CPU%u (%s)\n", c->cpu,
-                                 c->online ? "Online" : "Offline");
+                                 cpu_online(c->cpu) ? "Online" : "Offline");
        }
        spin_unlock_irqrestore(&state_lock, flags);
        return count;
@@ -286,7 +285,8 @@ static ssize_t show_global_state(const struct cluster_data *state, char *buf)
                count += snprintf(buf + count, PAGE_SIZE - count,
                                        "\tCPU: %u\n", c->cpu);
                count += snprintf(buf + count, PAGE_SIZE - count,
-                                       "\tOnline: %u\n", c->online);
+                                       "\tOnline: %u\n",
+                                       cpu_online(c->cpu));
                count += snprintf(buf + count, PAGE_SIZE - count,
                                        "\tActive: %u\n",
                                        !cpu_isolated(c->cpu));
@@ -534,7 +534,7 @@ static unsigned int get_active_cpu_count(const struct cluster_data *cluster)

 static bool is_active(const struct cpu_data *state)
 {
-       return state->online && !cpu_isolated(state->cpu);
+       return cpu_online(state->cpu) && !cpu_isolated(state->cpu);
 }

 static bool adjustment_possible(const struct cluster_data *cluster,
@@ -815,7 +815,7 @@ static void __try_to_unisolate(struct cluster_data *cluster,
                if (!c->isolated_by_us)
                        continue;
-               if ((c->online && !cpu_isolated(c->cpu)) ||
+               if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) ||
                        (!force && c->not_preferred))
                        continue;
                if (cluster->active_cpus == need)
@@ -904,19 +904,7 @@ static int __ref cpu_callback(struct notifier_block *nfb,
                return NOTIFY_OK;

        switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-
-               /* If online state of CPU somehow got out of sync, fix it. */
-               if (state->online) {
-                       state->online = false;
-                       cluster->active_cpus = get_active_cpu_count(cluster);
-                       pr_warn("CPU%d offline when state is online\n", cpu);
-               }
-               break;
-
        case CPU_ONLINE:
-
-               state->online = true;
                cluster->active_cpus = get_active_cpu_count(cluster);

                /*
@@ -941,15 +929,6 @@ static int __ref cpu_callback(struct notifier_block *nfb,
                /* Move a CPU to the end of the LRU when it goes offline. */
                move_cpu_lru(state);

-               /* Fall through */
-
-       case CPU_UP_CANCELED:
-
-               /* If online state of CPU somehow got out of sync, fix it. */
-               if (!state->online)
-                       pr_warn("CPU%d online when state is offline\n", cpu);
-
-               state->online = false;
                state->busy = 0;
                cluster->active_cpus = get_active_cpu_count(cluster);
                break;
@@ -1028,8 +1007,6 @@ static int cluster_init(const struct cpumask *mask)
                state = &per_cpu(cpu_state, cpu);
                state->cluster = cluster;
                state->cpu = cpu;
-               if (cpu_online(cpu))
-                       state->online = true;
                list_add_tail(&state->sib, &cluster->lru);
        }
        cluster->active_cpus = get_active_cpu_count(cluster);
-- cgit v1.2.3

From ad8d41e4d071e0fd878e0f5c5166fd668cdc9270 Mon Sep 17 00:00:00 2001
From: Pavankumar Kondeti
Date: Fri, 17 Mar 2017 15:06:47 +0530
Subject: core_ctl: use pr_fmt

We use pr_* to print info/warn/error messages. Use pr_fmt so that
the messages are prefixed with the "core_ctl:" string.

Change-Id: Icc84db30ebf0855599431366d090a2b1c94da6d0
Signed-off-by: Pavankumar Kondeti
---
 kernel/sched/core_ctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index d90cd817089c..4be384433156 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -10,6 +10,8 @@
 * GNU General Public License for more details.
 */

+#define pr_fmt(fmt) "core_ctl: " fmt
+
 #include
 #include
 #include
-- cgit v1.2.3

From cd85775d9c81c5c301907a61a6fd2709f3474369 Mon Sep 17 00:00:00 2001
From: Pavankumar Kondeti
Date: Fri, 17 Mar 2017 15:52:30 +0530
Subject: core_ctl: remove "cpus" sysfs file that prints CPU online status

The core_ctl module no longer hotplugs CPUs, so remove the
/sys/devices/system/cpu/cpuX/core_ctl/cpus file interface, which
prints the online/offline state of all CPUs in the cluster. The
global_state file interface can still be used to get all of the
state information about all CPUs.

Also replace the "Active" string with "Isolated" in the global_state
output; otherwise an offline CPU shows up as active.
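For reference, the per-CPU block of the global_state output would then
read something like this (hypothetical values, following the format
strings in the hunk below):

        CPU: 4
        Online: 1
        Isolated: 0
        First CPU: 4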
"Online" : "Offline"); - } - spin_unlock_irqrestore(&state_lock, flags); - return count; -} - static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) { return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); @@ -290,8 +274,8 @@ static ssize_t show_global_state(const struct cluster_data *state, char *buf) "\tOnline: %u\n", cpu_online(c->cpu)); count += snprintf(buf + count, PAGE_SIZE - count, - "\tActive: %u\n", - !cpu_isolated(c->cpu)); + "\tIsolated: %u\n", + cpu_isolated(c->cpu)); count += snprintf(buf + count, PAGE_SIZE - count, "\tFirst CPU: %u\n", cluster->first_cpu); @@ -378,7 +362,6 @@ core_ctl_attr_rw(busy_up_thres); core_ctl_attr_rw(busy_down_thres); core_ctl_attr_rw(task_thres); core_ctl_attr_rw(is_big_cluster); -core_ctl_attr_ro(cpus); core_ctl_attr_ro(need_cpus); core_ctl_attr_ro(active_cpus); core_ctl_attr_ro(global_state); @@ -392,7 +375,6 @@ static struct attribute *default_attrs[] = { &busy_down_thres.attr, &task_thres.attr, &is_big_cluster.attr, - &cpus.attr, &need_cpus.attr, &active_cpus.attr, &global_state.attr, -- cgit v1.2.3 From d71b1a76b1a797648ec6740f1a206e597d6b139a Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Tue, 21 Mar 2017 15:03:35 +0530 Subject: sched/core_ctl: Fix state_lock spinlock contention There is no need to call core_ctl_check() from scheduler tick path on all CPUs. This results in core_ctl's state_lock spin lock contention. Assign this job to the CPU which is responsible for updating the ticks. Change-Id: I9664037cc25c204d532bdd0f006c7e27ef143497 Signed-off-by: Pavankumar Kondeti --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 312ffdad034a..1017a3f77391 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -88,6 +88,7 @@ #include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" +#include "../time/tick-internal.h" #define CREATE_TRACE_POINTS #include @@ -3260,7 +3261,8 @@ void scheduler_tick(void) if (curr->sched_class == &fair_sched_class) check_for_migration(rq, curr); - core_ctl_check(wallclock); + if (cpu == tick_do_timer_cpu) + core_ctl_check(wallclock); } #ifdef CONFIG_NO_HZ_FULL -- cgit v1.2.3 From 2d328380d92563d779c75384749bef140c005367 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Tue, 21 Mar 2017 14:00:09 +0530 Subject: core_ctl: Add a kernel parameter to disable core_ctl Add a kernel parameter called "core_ctl_disable_cpumask" to specify the CPUs for which core_ctl is not needed. As core_ctl operates on a cluster basis, all of the CPUs in a given cluster must be specified to disable core_ctl on that cluster. 
Change-Id: Idfdc5b3aa9f54bafe20489e5ded9d96da6eff21c
Signed-off-by: Pavankumar Kondeti
---
 kernel/sched/core_ctl.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index 983159cc0646..a904c18704de 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -931,6 +931,42 @@ static struct notifier_block __refdata cpu_notifier = {

 /* ============================ init code ============================== */

+static cpumask_var_t core_ctl_disable_cpumask;
+static bool core_ctl_disable_cpumask_present;
+
+static int __init core_ctl_disable_setup(char *str)
+{
+       if (!*str)
+               return -EINVAL;
+
+       alloc_bootmem_cpumask_var(&core_ctl_disable_cpumask);
+
+       if (cpulist_parse(str, core_ctl_disable_cpumask) < 0) {
+               free_bootmem_cpumask_var(core_ctl_disable_cpumask);
+               return -EINVAL;
+       }
+
+       core_ctl_disable_cpumask_present = true;
+       pr_info("disable_cpumask=%*pbl\n",
+                       cpumask_pr_args(core_ctl_disable_cpumask));
+
+       return 0;
+}
+early_param("core_ctl_disable_cpumask", core_ctl_disable_setup);
+
+static bool should_skip(const struct cpumask *mask)
+{
+       if (!core_ctl_disable_cpumask_present)
+               return false;
+
+       /*
+        * We operate on a cluster basis. Disable core_ctl for a
+        * cluster if all of its cpus are specified in
+        * core_ctl_disable_cpumask.
+        */
+       return cpumask_subset(mask, core_ctl_disable_cpumask);
+}
+
 static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu)
 {
        unsigned int i;
@@ -952,6 +988,9 @@ static int cluster_init(const struct cpumask *mask)
        unsigned int cpu;
        struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };

+       if (should_skip(mask))
+               return 0;
+
        if (find_cluster_by_first_cpu(first_cpu))
                return 0;

@@ -1052,6 +1091,9 @@ static int __init core_ctl_init(void)
 {
        unsigned int cpu;

+       if (should_skip(cpu_possible_mask))
+               return 0;
+
        core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE)
                                        * NSEC_PER_MSEC;
-- cgit v1.2.3

From 2ab7675364d7510b1e0f8cd43b1aca6d12dafa08 Mon Sep 17 00:00:00 2001
From: Pavankumar Kondeti
Date: Tue, 21 Mar 2017 14:21:21 +0530
Subject: core_ctl: Bail out early from the boost API when core_ctl is disabled

core_ctl can now be completely disabled via a kernel parameter. Add a
check to bail out early from the boost API in that case.

Change-Id: Ib825500b0cb2c06af2cfcb82e5d79f91e9dd7b3b
Signed-off-by: Pavankumar Kondeti
---
 kernel/sched/core_ctl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index a904c18704de..1dde338d30f2 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -653,6 +653,9 @@ int core_ctl_set_boost(bool boost)
        int ret = 0;
        bool boost_state_changed = false;

+       if (unlikely(!initialized))
+               return 0;
+
        spin_lock_irqsave(&state_lock, flags);
        for_each_cluster(cluster, index) {
                if (cluster->is_big_cluster) {
-- cgit v1.2.3

From 775e281fb6c19350f5c65e83e7c4b768c67bbe44 Mon Sep 17 00:00:00 2001
From: Jack Pham
Date: Thu, 30 Mar 2017 09:53:53 -0700
Subject: trace: ipc_logging: Avoid buffer overflow in ipc_log_string()

In ipc_log_string() the return value from vsnprintf(), data_size, is
used to increment ectxt.offset. However, this length can actually be
much larger than the size of ectxt.buff itself. This is a typical
mistake in [v]snprintf() usage [1]: the return value is not the
number of characters written, but the number that *would* have been
written had the output not been truncated.
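For illustration, the same convention as seen with snprintf(), which
shares it (hypothetical values):

        char buf[8];
        int len = snprintf(buf, sizeof(buf), "0123456789");
        /* buf holds "0123456" plus the NUL, yet len == 10 */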
The result is that even though ectxt.buff itself is not overrun, the
incorrect size in ectxt.offset will later be used as the length
parameter when memcpy()'ing to the ipc_log_page's data, overflowing
that memory and beyond. The write_page's write_offset would likewise
indicate an out-of-bounds (greater than PAGE_SIZE) length.

The fix is simple: use vscnprintf() instead of vsnprintf().

[1] https://lwn.net/Articles/69419/

Change-Id: I2e9d44e74f5f30a009732e31a554d82e31946999
Signed-off-by: Jack Pham
---
 kernel/trace/ipc_logging.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ipc_logging.c b/kernel/trace/ipc_logging.c
index 2c3e0998d400..ed29c38cd7fb 100644
--- a/kernel/trace/ipc_logging.c
+++ b/kernel/trace/ipc_logging.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2017, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
@@ -507,8 +507,8 @@ int ipc_log_string(void *ilctxt, const char *fmt, ...)
        tsv_qtimer_write(&ectxt);
        avail_size = (MAX_MSG_SIZE - (ectxt.offset + hdr_size));
        va_start(arg_list, fmt);
-       data_size = vsnprintf((ectxt.buff + ectxt.offset + hdr_size),
-                             avail_size, fmt, arg_list);
+       data_size = vscnprintf((ectxt.buff + ectxt.offset + hdr_size),
+                              avail_size, fmt, arg_list);
        va_end(arg_list);
        tsv_write_header(&ectxt, TSV_TYPE_BYTE_ARRAY, data_size);
        ectxt.offset += data_size;
-- cgit v1.2.3