diff options
Diffstat (limited to 'kernel/fork.c')
-rw-r--r-- | kernel/fork.c | 164 |
1 files changed, 110 insertions, 54 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index 6a13c46cd87d..4dc2ddade9f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, return 0; } +void set_task_stack_end_magic(struct task_struct *tsk) +{ + unsigned long *stackend; + + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; - unsigned long *stackend; int node = tsk_fork_get_node(orig); int err; @@ -315,12 +322,20 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto free_ti; tsk->stack = ti; +#ifdef CONFIG_SECCOMP + /* + * We must handle setting up seccomp filters once we're under + * the sighand lock in case orig has changed between now and + * then. Until then, filter must be NULL to avoid messing up + * the usage counts on the error path calling free_task. + */ + tsk->seccomp.filter = NULL; +#endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); - stackend = end_of_stack(tsk); - *stackend = STACK_END_MAGIC; /* for overflow detection */ + set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); @@ -365,12 +380,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -419,9 +433,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) @@ -431,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* @@ -527,19 +541,37 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -549,11 +581,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + + return mm; +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -569,9 +607,8 @@ static void check_mm(struct mm_struct *mm) printk(KERN_ALERT "BUG: Bad rss-counter state " "mm:%p idx:%d val:%ld\n", mm, i, x); } - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON(mm->pmd_huge_pte); + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } @@ -587,7 +624,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -819,17 +855,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); @@ -851,15 +880,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@ -1002,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) { if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); + /* + * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * without an RCU grace period, see __lock_task_sighand(). + */ kmem_cache_free(sighand_cachep, sighand); } } - /* * Initialize POSIX timer handling for a thread group. */ @@ -1053,6 +1076,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; @@ -1081,6 +1105,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } +static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + /* + * Must be called with sighand->lock held, which is common to + * all threads in the group. Holding cred_guard_mutex is not + * needed because this new task is not yet running and cannot + * be racing exec. + */ + assert_spin_locked(¤t->sighand->siglock); + + /* Ref-count the new filter user, and assign it. */ + get_seccomp_filter(current); + p->seccomp = current->seccomp; + + /* + * Explicitly enable no_new_privs here in case it got set + * between the task_struct being duplicated and holding the + * sighand lock. The seccomp state and nnp must be in sync. + */ + if (task_no_new_privs(current)) + task_set_no_new_privs(p); + + /* + * If the parent gained a seccomp mode after copying thread + * flags and between before we held the sighand lock, we have + * to manually enable the seccomp thread flag here. + */ + if (p->seccomp.mode != SECCOMP_MODE_DISABLED) + set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} + SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1095,17 +1152,9 @@ static void rt_mutex_init_task(struct task_struct *p) p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; - p->pi_top_task = NULL; #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ @@ -1196,7 +1245,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); - get_seccomp_filter(p); rt_mutex_init_task(p); @@ -1262,9 +1310,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - do_posix_clock_monotonic_gettime(&p->start_time); - p->real_start_time = p->start_time; - monotonic_to_bootbased(&p->real_start_time); + p->start_time = ktime_get_ns(); + p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) @@ -1307,10 +1354,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -1326,8 +1369,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_cleanup_perf; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1437,6 +1481,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock(¤t->sighand->siglock); /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to @@ -1525,8 +1575,9 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_policy: +bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: @@ -1873,6 +1924,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); |