From abacd2fe3ca10b3ade57f3634053241a660002c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:31 -0800 Subject: coredump: set_dumpable: fix the theoretical race with itself set_dumpable() updates MMF_DUMPABLE_MASK in a non-trivial way to ensure that get_dumpable() can't observe the intermediate state, but this all can't help if multiple threads call set_dumpable() at the same time. And in theory commit_creds()->set_dumpable(SUID_DUMP_ROOT) racing with sys_prctl()->set_dumpable(SUID_DUMP_DISABLE) can result in SUID_DUMP_USER. Change this code to update both bits atomically via cmpxchg(). Note: this assumes that it is safe to mix bitops and cmpxchg. IOW, if, say, an architecture implements cmpxchg() using the locking (like arch/parisc/lib/bitops.c does), then it should use the same locks for set_bit/etc. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Alex Kelly Cc: "Eric W. Biederman" Cc: Josh Triplett Cc: Petr Matousek Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 49 +++++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 7ea097f6b341..f039386499db 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1614,43 +1614,24 @@ EXPORT_SYMBOL(set_binfmt); /* * set_dumpable converts traditional three-value dumpable to two flags and - * stores them into mm->flags. It modifies lower two bits of mm->flags, but - * these bits are not changed atomically. So get_dumpable can observe the - * intermediate state. To avoid doing unexpected behavior, get get_dumpable - * return either old dumpable or new one by paying attention to the order of - * modifying the bits. - * - * dumpable | mm->flags (binary) - * old new | initial interim final - * ---------+----------------------- - * 0 1 | 00 01 01 - * 0 2 | 00 10(*) 11 - * 1 0 | 01 00 00 - * 1 2 | 01 11 11 - * 2 0 | 11 10(*) 00 - * 2 1 | 11 11 01 - * - * (*) get_dumpable regards interim value of 10 as 11. + * stores them into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { - switch (value) { - case SUID_DUMP_DISABLE: - clear_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_USER: - set_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_ROOT: - set_bit(MMF_DUMP_SECURELY, &mm->flags); - smp_wmb(); - set_bit(MMF_DUMPABLE, &mm->flags); - break; - } + unsigned long old, new; + + do { + old = ACCESS_ONCE(mm->flags); + new = old & ~MMF_DUMPABLE_MASK; + + switch (value) { + case SUID_DUMP_ROOT: + new |= (1 << MMF_DUMP_SECURELY); + case SUID_DUMP_USER: + new |= (1<< MMF_DUMPABLE); + } + + } while (cmpxchg(&mm->flags, old, new) != old); } int __get_dumpable(unsigned long mm_flags) -- cgit v1.2.3 From 7288e1187ba935996232246916418c64bb88da30 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:32 -0800 Subject: coredump: kill MMF_DUMPABLE and MMF_DUMP_SECURELY Nobody actually needs MMF_DUMPABLE/MMF_DUMP_SECURELY, they are only used to enforce the encoding of SUID_DUMP_* enum in mm->flags & MMF_DUMPABLE_MASK. Now that set_dumpable() updates both bits atomically we can kill them and simply store the value "as is" in 2 lower bits. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Alex Kelly Cc: "Eric W. Biederman" Cc: Josh Triplett Cc: Petr Matousek Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index f039386499db..f798da06abac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1613,33 +1613,24 @@ void set_binfmt(struct linux_binfmt *new) EXPORT_SYMBOL(set_binfmt); /* - * set_dumpable converts traditional three-value dumpable to two flags and - * stores them into mm->flags. + * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { unsigned long old, new; + if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) + return; + do { old = ACCESS_ONCE(mm->flags); - new = old & ~MMF_DUMPABLE_MASK; - - switch (value) { - case SUID_DUMP_ROOT: - new |= (1 << MMF_DUMP_SECURELY); - case SUID_DUMP_USER: - new |= (1<< MMF_DUMPABLE); - } - + new = (old & ~MMF_DUMPABLE_MASK) | value; } while (cmpxchg(&mm->flags, old, new) != old); } int __get_dumpable(unsigned long mm_flags) { - int ret; - - ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; + return mm_flags & MMF_DUMPABLE_MASK; } /* -- cgit v1.2.3 From 942be3875a1931c379bbc37053829dd6847e0f3f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:34 -0800 Subject: coredump: make __get_dumpable/get_dumpable inline, kill fs/coredump.h 1. Remove fs/coredump.h. It is not clear why do we need it, it only declares __get_dumpable(), signal.c includes it for no reason. 2. Now that get_dumpable() and __get_dumpable() are really trivial make them inline in linux/sched.h. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Alex Kelly Cc: "Eric W. Biederman" Cc: Josh Triplett Cc: Petr Matousek Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index f798da06abac..9cbad5b0187e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -62,7 +62,6 @@ #include #include "internal.h" -#include "coredump.h" #include @@ -1609,7 +1608,6 @@ void set_binfmt(struct linux_binfmt *new) if (new) __module_get(new->module); } - EXPORT_SYMBOL(set_binfmt); /* @@ -1628,22 +1626,6 @@ void set_dumpable(struct mm_struct *mm, int value) } while (cmpxchg(&mm->flags, old, new) != old); } -int __get_dumpable(unsigned long mm_flags) -{ - return mm_flags & MMF_DUMPABLE_MASK; -} - -/* - * This returns the actual value of the suid_dumpable flag. For things - * that are using this for checking for privilege transitions, it must - * test against SUID_DUMP_USER rather than treating it as a boolean - * value. - */ -int get_dumpable(struct mm_struct *mm) -{ - return __get_dumpable(mm->flags); -} - SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, -- cgit v1.2.3 From 83f62a2eacb1d6945c78523f20e0c34b5d94913c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:49 -0800 Subject: exec:check_unsafe_exec: use while_each_thread() rather than next_thread() next_thread() should be avoided, change check_unsafe_exec() to use while_each_thread(). Nobody except signal->curr_target actually needs next_thread-like code, and we need to change (fix) this interface. This particular code is fine, p == current. But in general the code like this can loop forever if p exits and next_thread(t) can't reach the unhashed thread. This also saves 32 bytes. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 9cbad5b0187e..81ae6212187a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1243,10 +1243,11 @@ static int check_unsafe_exec(struct linux_binprm *bprm) if (current->no_new_privs) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - for (t = next_thread(p); t != p; t = next_thread(t)) { + while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } -- cgit v1.2.3 From 9e00cdb091b008cb3c78192651180896de412a63 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:50 -0800 Subject: exec:check_unsafe_exec: kill the dead -EAGAIN and clear_in_exec logic fs_struct->in_exec == T means that this ->fs is used by a single process (thread group), and one of the treads does do_execve(). To avoid the mt-exec races this code has the following complications: 1. check_unsafe_exec() returns -EBUSY if ->in_exec was already set by another thread. 2. do_execve_common() records "clear_in_exec" to ensure that the error path can only clear ->in_exec if it was set by current. However, after 9b1bf12d5d51 "signals: move cred_guard_mutex from task_struct to signal_struct" we do not need these complications: 1. We can't race with our sub-thread, this is called under per-process ->cred_guard_mutex. And we can't race with another CLONE_FS task, we already checked that this fs is not shared. We can remove the dead -EAGAIN logic. 2. "out_unmark:" in do_execve_common() is either called under ->cred_guard_mutex, or after de_thread() which kills other threads, so we can't race with sub-thread which could set ->in_exec. And if ->fs is shared with another process ->in_exec should be false anyway. We can clear in_exec unconditionally. This also means that check_unsafe_exec() can be void. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 81ae6212187a..389fe7b0ba14 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1223,11 +1223,10 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ -static int check_unsafe_exec(struct linux_binprm *bprm) +static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; - int res = 0; if (p->ptrace) { if (p->ptrace & PT_PTRACE_CAP) @@ -1253,22 +1252,15 @@ static int check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); - if (p->fs->users > n_fs) { + if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; - } else { - res = -EAGAIN; - if (!p->fs->in_exec) { - p->fs->in_exec = 1; - res = 1; - } - } + else + p->fs->in_exec = 1; spin_unlock(&p->fs->lock); - - return res; } -/* - * Fill the binprm structure from the inode. +/* + * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes * * This may be called multiple times for binary chains (scripts for example). @@ -1453,7 +1445,6 @@ static int do_execve_common(const char *filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; - bool clear_in_exec; int retval; /* @@ -1485,10 +1476,7 @@ static int do_execve_common(const char *filename, if (retval) goto out_free; - retval = check_unsafe_exec(bprm); - if (retval < 0) - goto out_free; - clear_in_exec = retval; + check_unsafe_exec(bprm); current->in_execve = 1; file = open_exec(filename); @@ -1558,8 +1546,7 @@ out_file: } out_unmark: - if (clear_in_exec) - current->fs->in_exec = 0; + current->fs->in_exec = 0; current->in_execve = 0; out_free: -- cgit v1.2.3 From 63e46b95e9eae1161832bf45cb40bbad37bfb182 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:51 -0800 Subject: exec: move the final allow_write_access/fput into free_bprm() Both success/failure paths cleanup bprm->file, we can move this code into free_bprm() to simlify and cleanup this logic. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 389fe7b0ba14..f860866e04ba 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1138,9 +1138,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; - flush_signal_handlers(current, 0); do_close_on_exec(current->files); } @@ -1172,6 +1170,10 @@ void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); @@ -1424,12 +1426,6 @@ static int exec_binprm(struct linux_binprm *bprm) ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; proc_exec_connector(current); - - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; /* to catch use-after-free */ - } } return ret; @@ -1492,7 +1488,7 @@ static int do_execve_common(const char *filename, retval = bprm_mm_init(bprm); if (retval) - goto out_file; + goto out_unmark; bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) @@ -1539,12 +1535,6 @@ out: mmput(bprm->mm); } -out_file: - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } - out_unmark: current->fs->in_exec = 0; current->in_execve = 0; -- cgit v1.2.3 From 98611e4e6a2b4a03fd2d4750cce8e4455a995c8d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:52 -0800 Subject: exec: kill task_struct->did_exec We can kill either task->did_exec or PF_FORKNOEXEC, they are mutually exclusive. The patch kills ->did_exec because it has a single user. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index f860866e04ba..493b102a27c1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1424,7 +1424,6 @@ static int exec_binprm(struct linux_binprm *bprm) audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - current->did_exec = 1; proc_exec_connector(current); } -- cgit v1.2.3 From b88fae644e5e3922251a4b242f435f5e3b49c381 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 23 Jan 2014 15:55:57 -0800 Subject: exec: avoid propagating PF_NO_SETAFFINITY into userspace child Userspace process doesn't want the PF_NO_SETAFFINITY, but its parent may be a kernel worker thread which has PF_NO_SETAFFINITY set, and this worker thread can do kernel_thread() to create the child. Clearing this flag in usersapce child to enable its migrating capability. Signed-off-by: Zhang Yi Acked-by: Oleg Nesterov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 493b102a27c1..44218a7267b2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1087,8 +1087,8 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ set_fs(USER_DS); - current->flags &= - ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE); + current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); current->personality &= ~bprm->per_clear; -- cgit v1.2.3 From 3b96d7db3b6dc99d207bca50037274d22e48dea5 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 23 Jan 2014 15:55:58 -0800 Subject: fs/exec.c: call arch_pick_mmap_layout() only once Currently both setup_new_exec() and flush_old_exec() issue a call to arch_pick_mmap_layout(). As setup_new_exec() and flush_old_exec() are always called pairwise arch_pick_mmap_layout() is called twice. This patch removes one call from setup_new_exec() to have it only called once. Signed-off-by: Richard Weinberger Tested-by: Pat Erley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 44218a7267b2..e1529b4c79b1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -842,7 +842,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); -- cgit v1.2.3