From c7e49c1488ab20342eaaf38f1ca35a207f4c051d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 10 Aug 2010 18:03:07 -0700 Subject: ptrace: optimize exit_ptrace() for the likely case exit_ptrace() takes tasklist_lock unconditionally. We need this lock to avoid the race with ptrace_traceme(), it acts as a barrier. Change its caller, forget_original_parent(), to call exit_ptrace() under tasklist_lock. Change exit_ptrace() to drop and reacquire this lock if needed. This allows us to add the fastpath list_empty(ptraced) check. In the likely no-tracees case exit_ptrace() just returns and we avoid the lock() + unlock() sequence. "Zhang, Yanmin" suggested to add this check, and he reports that this change adds about 11% improvement in some tests. Suggested-and-tested-by: "Zhang, Yanmin" Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index ceffc67b564a..671ed56e0a49 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -771,9 +771,12 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(dead_children); - exit_ptrace(father); - write_lock_irq(&tasklist_lock); + /* + * Note that exit_ptrace() and find_new_reaper() might + * drop tasklist_lock and reacquire it. + */ + exit_ptrace(father); reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { -- cgit v1.2.3 From f362b73244fb16ea4ae127ced1467dd8adaa7733 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 17 Aug 2010 23:56:55 +0100 Subject: Fix unprotected access to task credentials in waitid() Using a program like the following: #include #include #include #include int main() { id_t id; siginfo_t infop; pid_t res; id = fork(); if (id == 0) { sleep(1); exit(0); } kill(id, SIGSTOP); alarm(1); waitid(P_PID, id, &infop, WCONTINUED); return 0; } to call waitid() on a stopped process results in access to the child task's credentials without the RCU read lock being held - which may be replaced in the meantime - eliciting the following warning: =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- kernel/exit.c:1460 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 2 locks held by waitid02/22252: #0: (tasklist_lock){.?.?..}, at: [] do_wait+0xc5/0x310 #1: (&(&sighand->siglock)->rlock){-.-...}, at: [] wait_consider_task+0x19a/0xbe0 stack backtrace: Pid: 22252, comm: waitid02 Not tainted 2.6.35-323cd+ #3 Call Trace: [] lockdep_rcu_dereference+0xa4/0xc0 [] wait_consider_task+0xaf1/0xbe0 [] do_wait+0xf5/0x310 [] sys_waitid+0x86/0x1f0 [] ? child_wait_callback+0x0/0x70 [] system_call_fastpath+0x16/0x1b This is fixed by holding the RCU read lock in wait_task_continued() to ensure that the task's current credentials aren't destroyed between us reading the cred pointer and us reading the UID from those credentials. Furthermore, protect wait_task_stopped() in the same way. We don't need to keep holding the RCU read lock once we've read the UID from the credentials as holding the RCU read lock doesn't stop the target task from changing its creds under us - so the credentials may be outdated immediately after we've read the pointer, lock or no lock. Signed-off-by: Daniel J Blueman Signed-off-by: David Howells Acked-by: Paul E. McKenney Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 671ed56e0a49..03120229db28 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1386,8 +1386,7 @@ static int wait_task_stopped(struct wait_opts *wo, if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; - /* don't need the RCU readlock here as we're holding a spinlock */ - uid = __task_cred(p)->uid; + uid = task_uid(p); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1460,7 +1459,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = __task_cred(p)->uid; + uid = task_uid(p); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); -- cgit v1.2.3 From 4e231c7962ce711c7d8c2a4dc23ecd1e8fc28363 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Sep 2010 21:01:59 +0200 Subject: perf: Fix up delayed_put_task_struct() I missed a perf_event_ctxp user when converting it to an array. Pull this last user into perf_event.c as well and fix it up. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/exit.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db28..e2bdf37f9fde 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_EVENTS - WARN_ON_ONCE(tsk->perf_event_ctxp); -#endif + perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } -- cgit v1.2.3 From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: oom: add per-mm oom disable count It's pointless to kill a task if another thread sharing its mm cannot be killed to allow future memory freeing. A subsequent patch will prevent kills in such cases, but first it's necessary to have a way to flag a task that shares memory with an OOM_DISABLE task that doesn't incur an additional tasklist scan, which would make select_bad_process() an O(n^2) function. This patch adds an atomic counter to struct mm_struct that follows how many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN. They cannot be killed by the kernel, so their memory cannot be freed in oom conditions. This only requires task_lock() on the task that we're operating on, it does not require mm->mmap_sem since task_lock() pins the mm and the operation is atomic. [rientjes@google.com: changelog and sys_unshare() code] [rientjes@google.com: protect oom_disable_count with task_lock in fork] [rientjes@google.com: use old_mm for oom_disable_count in exec] Signed-off-by: Ying Han Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index e2bdf37f9fde..894179a32ec1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); -- cgit v1.2.3 From d16e15f5b029fc7d03540ba0e5fb23b0abb0ebe0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:34:10 -0700 Subject: exit: add lock context annotation on find_new_reaper() find_new_reaper() releases and regrabs tasklist_lock but was missing proper annotations. Add it. This remove following sparse warning: warning: context imbalance in 'find_new_reaper' - unexpected unlock Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 894179a32ec1..b194febf5799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -703,6 +703,8 @@ static void exit_mm(struct task_struct * tsk) * space. */ static struct task_struct *find_new_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; -- cgit v1.2.3 From e0a70217107e6f9844628120412cb27bb4cea194 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 5 Nov 2010 16:53:42 +0100 Subject: posix-cpu-timers: workaround to suppress the problems with mt exec posix-cpu-timers.c correctly assumes that the dying process does posix_cpu_timers_exit_group() and removes all !CPUCLOCK_PERTHREAD timers from signal->cpu_timers list. But, it also assumes that timer->it.cpu.task is always the group leader, and thus the dead ->task means the dead thread group. This is obviously not true after de_thread() changes the leader. After that almost every posix_cpu_timer_ method has problems. It is not simple to fix this bug correctly. First of all, I think that timer->it.cpu should use struct pid instead of task_struct. Also, the locking should be reworked completely. In particular, tasklist_lock should not be used at all. This all needs a lot of nontrivial and hard-to-test changes. Change __exit_signal() to do posix_cpu_timers_exit_group() when the old leader dies during exec. This is not the fix, just the temporary hack to hide the problem for 2.6.37 and stable. IOW, this is obviously wrong but this is what we currently have anyway: cpu timers do not work after mt exec. In theory this change adds another race. The exiting leader can detach the timers which were attached to the new leader. However, the window between de_thread() and release_task() is small, we can pretend that sys_timer_create() was called before de_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index b194febf5799..21aa7b3001fb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -95,6 +95,14 @@ static void __exit_signal(struct task_struct *tsk) tty = sig->tty; sig->tty = NULL; } else { + /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. + */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + /* * If there is any task waiting for the group exit * then notify it: -- cgit v1.2.3 From 33dd94ae1ccbfb7bf0fb6c692bc3d1c4269e6177 Mon Sep 17 00:00:00 2001 From: Nelson Elhage Date: Thu, 2 Dec 2010 14:31:21 -0800 Subject: do_exit(): make sure that we run with get_fs() == USER_DS If a user manages to trigger an oops with fs set to KERNEL_DS, fs is not otherwise reset before do_exit(). do_exit may later (via mm_release in fork.c) do a put_user to a user-controlled address, potentially allowing a user to leverage an oops into a controlled write into kernel memory. This is only triggerable in the presence of another bug, but this potentially turns a lot of DoS bugs into privilege escalations, so it's worth fixing. I have proof-of-concept code which uses this bug along with CVE-2010-3849 to write a zero to an arbitrary kernel address, so I've tested that this is not theoretical. A more logical place to put this fix might be when we know an oops has occurred, before we call do_exit(), but that would involve changing every architecture, in multiple places. Let's just stick it in do_exit instead. [akpm@linux-foundation.org: update code comment] Signed-off-by: Nelson Elhage Cc: KOSAKI Motohiro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 21aa7b3001fb..676149a4ac5f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -914,6 +914,15 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + /* + * If do_exit is called because this processes oopsed, it's possible + * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before + * continuing. Amongst other possible reasons, this is to prevent + * mm_release()->clear_child_tid() from writing to a user-controlled + * kernel address. + */ + set_fs(USER_DS); + tracehook_report_exit(&code); validate_creds_for_do_exit(tsk); -- cgit v1.2.3