From 0d57cbc66b9018f548f6f0dd3b1aff8bf1a214d2 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 1 Sep 2016 16:14:44 -0700 Subject: kexec: fix double-free when failing to relocate the purgatory commit 070c43eea5043e950daa423707ae3c77e2f48edb upstream. If kexec_apply_relocations fails, kexec_load_purgatory frees pi->sechdrs and pi->purgatory_buf. This is redundant, because in case of error kimage_file_prepare_segments calls kimage_file_post_load_cleanup, which will also free those buffers. This causes two warnings like the following, one for pi->sechdrs and the other for pi->purgatory_buf: kexec-bzImage64: Loading purgatory failed ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2119 at mm/vmalloc.c:1490 __vunmap+0xc1/0xd0 Trying to vfree() nonexistent vm area (ffffc90000e91000) Modules linked in: CPU: 1 PID: 2119 Comm: kexec Not tainted 4.8.0-rc3+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x4d/0x65 __warn+0xcb/0xf0 warn_slowpath_fmt+0x4f/0x60 ? find_vmap_area+0x19/0x70 ? kimage_file_post_load_cleanup+0x47/0xb0 __vunmap+0xc1/0xd0 vfree+0x2e/0x70 kimage_file_post_load_cleanup+0x5e/0xb0 SyS_kexec_file_load+0x448/0x680 ? putname+0x54/0x60 ? do_sys_open+0x190/0x1f0 entry_SYSCALL_64_fastpath+0x13/0x8f ---[ end trace 158bb74f5950ca2b ]--- Fix by setting pi->sechdrs an pi->purgatory_buf to NULL, since vfree won't try to free a NULL pointer. Link: http://lkml.kernel.org/r/1472083546-23683-1-git-send-email-bauerman@linux.vnet.ibm.com Signed-off-by: Thiago Jung Bauermann Acked-by: Baoquan He Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/kexec_file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada0028d2..6030efd4a188 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -934,7 +934,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } -- cgit v1.2.3 From f750847daa22bee9cbb3309f11e8c2eef7bbe5c6 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:38 +0200 Subject: mm: introduce get_task_exe_file commit cd81a9170e69e018bbaba547c1fd85a585f5697a upstream. For more convenient access if one has a pointer to the task. As a minor nit take advantage of the fact that only task lock + rcu are needed to safely grab ->exe_file. This saves mm refcount dance. Use the helper in proc_exe_link. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c485cb156772..8860d1f50d24 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -763,6 +763,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) } EXPORT_SYMBOL(get_mm_exe_file); +/** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). + */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + /** * get_task_mm - acquire a reference to the task's mm * -- cgit v1.2.3 From d0259cc85b4c37930c102c21c03f64b5afa25792 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:39 +0200 Subject: audit: fix exe_file access in audit_exe_compare commit 5efc244346f9f338765da3d592f7947b0afdc4b5 upstream. Prior to the change the function would blindly deference mm, exe_file and exe_file->f_inode, each of which could have been NULL or freed. Use get_task_exe_file to safely obtain stable exe_file. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- kernel/audit_watch.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..939945a5649c 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } -- cgit v1.2.3 From 06ec7a1d7646833cac76516fe78a23577cdb4a8a Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Tue, 9 Aug 2016 11:25:01 +0800 Subject: cpuset: make sure new tasks conform to the current config of the cpuset commit 06f4e94898918bcad00cdd4d349313a439d6911e upstream. A new task inherits cpus_allowed and mems_allowed masks from its parent, but if someone changes cpuset's config by writing to cpuset.cpus/cpuset.mems before this new task is inserted into the cgroup's task list, the new task won't be updated accordingly. Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 11eaf14b52c2..e120bd983ad0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2074,6 +2074,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +void cpuset_fork(struct task_struct *task) +{ + if (task_css_is_root(task, cpuset_cgrp_id)) + return; + + set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + task->mems_allowed = current->mems_allowed; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, @@ -2084,6 +2098,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = 1, }; -- cgit v1.2.3 From a37a538e82b037b51ad8418c5c33d1274cfe27d4 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 5 Sep 2016 13:16:40 +1000 Subject: sched/core: Fix a race between try_to_wake_up() and a woken up task commit 135e8c9250dd5c8c9aae5984fde6f230d0cbfeaf upstream. The origin of the issue I've seen is related to a missing memory barrier between check for task->state and the check for task->on_rq. The task being woken up is already awake from a schedule() and is doing the following: do { schedule() set_current_state(TASK_(UN)INTERRUPTIBLE); } while (!cond); The waker, actually gets stuck doing the following in try_to_wake_up(): while (p->on_cpu) cpu_relax(); Analysis: The instance I've seen involves the following race: CPU1 CPU2 while () { if (cond) break; do { schedule(); set_current_state(TASK_UN..) } while (!cond); wakeup_routine() spin_lock_irqsave(wait_lock) raw_spin_lock_irqsave(wait_lock) wake_up_process() } try_to_wake_up() set_current_state(TASK_RUNNING); .. list_del(&waiter.list); CPU2 wakes up CPU1, but before it can get the wait_lock and set current state to TASK_RUNNING the following occurs: CPU3 wakeup_routine() raw_spin_lock_irqsave(wait_lock) if (!list_empty) wake_up_process() try_to_wake_up() raw_spin_lock_irqsave(p->pi_lock) .. if (p->on_rq && ttwu_wakeup()) .. while (p->on_cpu) cpu_relax() .. CPU3 tries to wake up the task on CPU1 again since it finds it on the wait_queue, CPU1 is spinning on wait_lock, but immediately after CPU2, CPU3 got it. CPU3 checks the state of p on CPU1, it is TASK_UNINTERRUPTIBLE and the task is spinning on the wait_lock. Interestingly since p->on_rq is checked under pi_lock, I've noticed that try_to_wake_up() finds p->on_rq to be 0. This was the most confusing bit of the analysis, but p->on_rq is changed under runqueue lock, rq_lock, the p->on_rq check is not reliable without this fix IMHO. The race is visible (based on the analysis) only when ttwu_queue() does a remote wakeup via ttwu_queue_remote. In which case the p->on_rq change is not done uder the pi_lock. The result is that after a while the entire system locks up on the raw_spin_irqlock_save(wait_lock) and the holder spins infintely Reproduction of the issue: The issue can be reproduced after a long run on my system with 80 threads and having to tweak available memory to very low and running memory stress-ng mmapfork test. It usually takes a long time to reproduce. I am trying to work on a test case that can reproduce the issue faster, but thats work in progress. I am still testing the changes on my still in a loop and the tests seem OK thus far. Big thanks to Benjamin and Nick for helping debug this as well. Ben helped catch the missing barrier, Nick caught every missing bit in my theory. Signed-off-by: Balbir Singh [ Updated comment to clarify matching barriers. Many architectures do not have a full barrier in switch_to() so that cannot be relied upon. ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Benjamin Herrenschmidt Cc: Alexey Kardashevskiy Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e02cce7b-d9ca-1ad0-7a61-ea97c7582b37@gmail.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ea863bc22caf..20253dbc8610 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1945,6 +1945,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; -- cgit v1.2.3 From 268a8bf90ee36d159bdb73079dec9c0e83e362ef Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 1 Sep 2016 16:14:47 -0700 Subject: kconfig: tinyconfig: provide whole choice blocks to avoid warnings commit 236dec051078a8691950f56949612b4b74107e48 upstream. Using "make tinyconfig" produces a couple of annoying warnings that show up for build test machines all the time: .config:966:warning: override: NOHIGHMEM changes choice state .config:965:warning: override: SLOB changes choice state .config:963:warning: override: KERNEL_XZ changes choice state .config:962:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:933:warning: override: SLOB changes choice state .config:930:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:870:warning: override: SLOB changes choice state .config:868:warning: override: KERNEL_XZ changes choice state .config:867:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state I've made a previous attempt at fixing them and we discussed a number of alternatives. I tried changing the Makefile to use "merge_config.sh -n $(fragment-list)" but couldn't get that to work properly. This is yet another approach, based on the observation that we do want to see a warning for conflicting 'choice' options, and that we can simply make them non-conflicting by listing all other options as disabled. This is a trivial patch that we can apply independent of plans for other changes. Link: http://lkml.kernel.org/r/20160829214952.1334674-2-arnd@arndb.de Link: https://storage.kernelci.org/mainline/v4.7-rc6/x86-tinyconfig/build.log https://patchwork.kernel.org/patch/9212749/ Signed-off-by: Arnd Bergmann Reviewed-by: Josh Triplett Reviewed-by: Masahiro Yamada Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/configs/tiny.config | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y -- cgit v1.2.3 From 2fbc61d977c4ad1d448c67fa56d8c689121e7f19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 15:32:25 +0200 Subject: genirq/msi: Fix broken debug output commit 4364e1a29be16b2783c0bcbc263f61236af64281 upstream. virq is not required to be the same for all msi descs. Use the base irq number from the desc in the debug printk. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/msi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 4b21779d5163..cd6009006510 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -298,6 +298,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->msi_finish(&arg, 0); for_each_msi_entry(desc, dev) { + virq = desc->irq; if (desc->nvec_used == 1) dev_dbg(dev, "irq %d for MSI\n", virq); else -- cgit v1.2.3