Diffstat (limited to 'kernel/fork.c')
-rw-r--r--	kernel/fork.c	216
1 file changed, 138 insertions, 78 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 728d5be9548c..8c162d102740 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
 #include <linux/cpu.h>
@@ -71,6 +73,7 @@
 #include <linux/signalfd.h>
 #include <linux/uprobes.h>
 #include <linux/aio.h>
+#include <linux/compiler.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -147,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 						  int node)
 {
-	struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
-					     THREAD_SIZE_ORDER);
+	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
+						  THREAD_SIZE_ORDER);
 
 	return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_info(struct thread_info *ti)
 {
-	free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+	free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_info_cache;
@@ -237,6 +240,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	task_numa_free(tsk);
 	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
@@ -283,7 +287,7 @@ void __init fork_init(unsigned long mempages)
 		init_task.signal->rlim[RLIMIT_NPROC];
 }
 
-int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+int __weak arch_dup_task_struct(struct task_struct *dst,
 					       struct task_struct *src)
 {
 	*dst = *src;
@@ -311,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		goto free_ti;
 
 	tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+	/*
+	 * We must handle setting up seccomp filters once we're under
+	 * the sighand lock in case orig has changed between now and
+	 * then. Until then, filter must be NULL to avoid messing up
+	 * the usage counts on the error path calling free_task.
+	 */
+	tsk->seccomp.filter = NULL;
+#endif
 
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
@@ -361,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	 */
 	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
 
-	mm->locked_vm = 0;
-	mm->mmap = NULL;
-	mm->mmap_cache = NULL;
-	mm->map_count = 0;
-	cpumask_clear(mm_cpumask(mm));
-	mm->mm_rb = RB_ROOT;
+	mm->total_vm = oldmm->total_vm;
+	mm->shared_vm = oldmm->shared_vm;
+	mm->exec_vm = oldmm->exec_vm;
+	mm->stack_vm = oldmm->stack_vm;
+
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
 	pprev = &mm->mmap;
@@ -417,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				atomic_dec(&inode->i_writecount);
 			mutex_lock(&mapping->i_mmap_mutex);
 			if (tmp->vm_flags & VM_SHARED)
-				mapping->i_mmap_writable++;
+				atomic_inc(&mapping->i_mmap_writable);
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
 			if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -523,27 +535,57 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
+static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+	mm->owner = p;
+#endif
+}
+
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 {
+	mm->mmap = NULL;
+	mm->mm_rb = RB_ROOT;
+	mm->vmacache_seqnum = 0;
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
-	mm->flags = (current->mm) ?
-		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
 	atomic_long_set(&mm->nr_ptes, 0);
+	mm->map_count = 0;
+	mm->locked_vm = 0;
+	mm->pinned_vm = 0;
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);
+	mm_init_cpumask(mm);
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	mmu_notifier_mm_init(mm);
+	clear_tlb_flush_pending(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+	mm->pmd_huge_pte = NULL;
+#endif
 
-	if (likely(!mm_alloc_pgd(mm))) {
+	if (current->mm) {
+		mm->flags = current->mm->flags & MMF_INIT_MASK;
+		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+	} else {
+		mm->flags = default_dump_filter;
 		mm->def_flags = 0;
-		mmu_notifier_mm_init(mm);
-		return mm;
 	}
 
+	if (mm_alloc_pgd(mm))
+		goto fail_nopgd;
+
+	if (init_new_context(p, mm))
+		goto fail_nocontext;
+
+	return mm;
+
+fail_nocontext:
+	mm_free_pgd(mm);
+fail_nopgd:
 	free_mm(mm);
 	return NULL;
 }
@@ -559,9 +601,8 @@ static void check_mm(struct mm_struct *mm)
 			printk(KERN_ALERT "BUG: Bad rss-counter state "
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
-
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-	VM_BUG_ON(mm->pmd_huge_pte);
+	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
 #endif
 }
 
@@ -577,7 +618,6 @@ struct mm_struct *mm_alloc(void)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
-	mm_init_cpumask(mm);
 	return mm_init(mm, current);
 }
 
@@ -799,30 +839,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
  * Allocate a new mm structure and copy contents from the
  * mm structure of the passed in task structure.
  */
-struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk)
 {
 	struct mm_struct *mm, *oldmm = current->mm;
 	int err;
 
-	if (!oldmm)
-		return NULL;
-
 	mm = allocate_mm();
 	if (!mm)
 		goto fail_nomem;
 
 	memcpy(mm, oldmm, sizeof(*mm));
-	mm_init_cpumask(mm);
 
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-	mm->pmd_huge_pte = NULL;
-#endif
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
-	if (init_new_context(tsk, mm))
-		goto fail_nocontext;
-
 	dup_mm_exe_file(oldmm, mm);
 
 	err = dup_mmap(mm, oldmm);
@@ -844,15 +874,6 @@ free_pt:
 
 fail_nomem:
 	return NULL;
-
-fail_nocontext:
-	/*
-	 * If init_new_context() failed, we cannot use mmput() to free the mm
-	 * because it calls destroy_context()
-	 */
-	mm_free_pgd(mm);
-	free_mm(mm);
-	return NULL;
 }
 
 static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -878,6 +899,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	if (!oldmm)
 		return 0;
 
+	/* initialize the new vmacache entries */
+	vmacache_flush(tsk);
+
 	if (clone_flags & CLONE_VM) {
 		atomic_inc(&oldmm->mm_users);
 		mm = oldmm;
@@ -1034,6 +1058,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->nr_threads = 1;
 	atomic_set(&sig->live, 1);
 	atomic_set(&sig->sigcnt, 1);
+
+	/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
+	sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
+	tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
+
 	init_waitqueue_head(&sig->wait_chldexit);
 	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
@@ -1066,13 +1095,37 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
+static void copy_seccomp(struct task_struct *p)
 {
-	unsigned long new_flags = p->flags;
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Must be called with sighand->lock held, which is common to
+	 * all threads in the group. Holding cred_guard_mutex is not
+	 * needed because this new task is not yet running and cannot
+	 * be racing exec.
+	 */
+	assert_spin_locked(&current->sighand->siglock);
+
+	/* Ref-count the new filter user, and assign it. */
+	get_seccomp_filter(current);
+	p->seccomp = current->seccomp;
+
+	/*
+	 * Explicitly enable no_new_privs here in case it got set
+	 * between the task_struct being duplicated and holding the
+	 * sighand lock. The seccomp state and nnp must be in sync.
+	 */
+	if (task_no_new_privs(current))
+		task_set_no_new_privs(p);
 
-	new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
-	new_flags |= PF_FORKNOEXEC;
-	p->flags = new_flags;
+	/*
+	 * If the parent gained a seccomp mode after copying thread
+	 * flags and before we held the sighand lock, we have to
+	 * manually enable the seccomp thread flag here.
+	 */
+	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+		set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
 }
 
 SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1086,18 +1139,12 @@ static void rt_mutex_init_task(struct task_struct *p)
 {
 	raw_spin_lock_init(&p->pi_lock);
 #ifdef CONFIG_RT_MUTEXES
-	plist_head_init(&p->pi_waiters);
+	p->pi_waiters = RB_ROOT;
+	p->pi_waiters_leftmost = NULL;
 	p->pi_blocked_on = NULL;
 #endif
 }
 
-#ifdef CONFIG_MM_OWNER
-void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
-{
-	mm->owner = p;
-}
-#endif /* CONFIG_MM_OWNER */
-
 /*
  * Initialize POSIX timer handling for a single task.
  */
@@ -1171,7 +1218,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * do not allow it to share a thread group or signal handlers or
 	 * parent with the forking task.
 	 */
-	if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) {
+	if (clone_flags & CLONE_SIGHAND) {
 		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
 		    (task_active_pid_ns(current) !=
 				current->nsproxy->pid_ns_for_children))
@@ -1188,7 +1235,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	ftrace_graph_init_task(p);
-	get_seccomp_filter(p);
 
 	rt_mutex_init_task(p);
 
@@ -1221,9 +1267,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (!try_module_get(task_thread_info(p)->exec_domain->module))
 		goto bad_fork_cleanup_count;
 
-	p->did_exec = 0;
 	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
-	copy_flags(clone_flags, p);
+	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+	p->flags |= PF_FORKNOEXEC;
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
 	rcu_copy_process(p);
@@ -1254,9 +1300,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	posix_cpu_timers_init(p);
 
-	do_posix_clock_monotonic_gettime(&p->start_time);
-	p->real_start_time = p->start_time;
-	monotonic_to_bootbased(&p->real_start_time);
+	p->start_time = ktime_get_ns();
+	p->real_start_time = ktime_get_boot_ns();
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	if (clone_flags & CLONE_THREAD)
@@ -1267,9 +1312,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (IS_ERR(p->mempolicy)) {
 		retval = PTR_ERR(p->mempolicy);
 		p->mempolicy = NULL;
-		goto bad_fork_cleanup_cgroup;
+		goto bad_fork_cleanup_threadgroup_lock;
 	}
-	mpol_fix_fork_child_flag(p);
 #endif
 #ifdef CONFIG_CPUSETS
 	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
@@ -1300,25 +1344,24 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
-#ifdef CONFIG_MEMCG
-	p->memcg_batch.do_batch = 0;
-	p->memcg_batch.memcg = NULL;
-#endif
 #ifdef CONFIG_BCACHE
 	p->sequential_io = 0;
 	p->sequential_io_avg = 0;
 #endif
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(clone_flags, p);
+	retval = sched_fork(clone_flags, p);
+	if (retval)
+		goto bad_fork_cleanup_policy;
 
 	retval = perf_event_init_task(p);
 	if (retval)
 		goto bad_fork_cleanup_policy;
 	retval = audit_alloc(p);
 	if (retval)
-		goto bad_fork_cleanup_policy;
+		goto bad_fork_cleanup_perf;
 	/* copy all the process information */
+	shm_init_task(p);
 	retval = copy_semundo(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_audit;
@@ -1402,13 +1445,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		p->tgid = p->pid;
 	}
 
-	p->pdeath_signal = 0;
-	p->exit_state = 0;
-
 	p->nr_dirtied = 0;
 	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
 	p->dirty_paused_when = 0;
 
+	p->pdeath_signal = 0;
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
@@ -1430,6 +1471,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	spin_lock(&current->sighand->siglock);
 
 	/*
+	 * Copy seccomp details explicitly here, in case they were changed
+	 * before holding sighand lock.
+	 */
+	copy_seccomp(p);
+
+	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
 	 * fork. Restart if a signal comes in before we add the new process to
@@ -1471,6 +1518,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			atomic_inc(&current->signal->sigcnt);
 			list_add_tail_rcu(&p->thread_group,
 					  &p->group_leader->thread_group);
+			list_add_tail_rcu(&p->thread_node,
+					  &p->signal->thread_head);
 		}
 		attach_pid(p, PIDTYPE_PID);
 		nr_threads++;
@@ -1478,7 +1527,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
+	syscall_tracepoint_update(p);
 	write_unlock_irq(&tasklist_lock);
+
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	if (clone_flags & CLONE_THREAD)
@@ -1514,15 +1565,15 @@ bad_fork_cleanup_semundo:
 	exit_sem(p);
 bad_fork_cleanup_audit:
 	audit_free(p);
-bad_fork_cleanup_policy:
+bad_fork_cleanup_perf:
 	perf_event_free_task(p);
+bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
-bad_fork_cleanup_cgroup:
+bad_fork_cleanup_threadgroup_lock:
 #endif
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_change_end(current);
-	cgroup_exit(p, 0);
 	delayacct_tsk_free(p);
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
@@ -1598,10 +1649,12 @@ long do_fork(unsigned long clone_flags,
 	 */
 	if (!IS_ERR(p)) {
 		struct completion vfork;
+		struct pid *pid;
 
 		trace_sched_process_fork(current, p);
 
-		nr = task_pid_vnr(p);
+		pid = get_task_pid(p, PIDTYPE_PID);
+		nr = pid_vnr(pid);
 
 		if (clone_flags & CLONE_PARENT_SETTID)
 			put_user(nr, parent_tidptr);
@@ -1616,12 +1669,14 @@ long do_fork(unsigned long clone_flags,
 
 		/* forking complete and child started to run, tell ptracer */
 		if (unlikely(trace))
-			ptrace_event(trace, nr);
+			ptrace_event_pid(trace, pid);
 
 		if (clone_flags & CLONE_VFORK) {
 			if (!wait_for_vfork_done(p, &vfork))
-				ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
+				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
 		}
+
+		put_pid(pid);
 	} else {
 		nr = PTR_ERR(p);
 	}
@@ -1644,7 +1699,7 @@ SYSCALL_DEFINE0(fork)
 	return do_fork(SIGCHLD, 0, 0, NULL, NULL);
 #else
 	/* can not support in nommu mode */
-	return(-EINVAL);
+	return -EINVAL;
#endif
 }
 #endif
@@ -1652,7 +1707,7 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
 			0, NULL, NULL);
 }
 #endif
@@ -1859,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 		 */
 		exit_sem(current);
 	}
+	if (unshare_flags & CLONE_NEWIPC) {
+		/* Orphan segments in old ns (see sem above). */
+		exit_shm(current);
+		shm_init_task(current);
+	}
 
 	if (new_nsproxy)
 		switch_task_namespaces(current, new_nsproxy);
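
Two hunks above are worth unpacking. First, mm_init(): the hunk replaces the old single-exit cleanup with the kernel's usual goto-unwind error handling, where each fallible setup step gets a label and failure paths fall through the labels in reverse order. Below is a minimal standalone sketch of that idiom, not kernel code: struct ctx, alloc_pgd() and alloc_context() are hypothetical stand-ins for mm_struct, mm_alloc_pgd() and init_new_context().

	#include <stdio.h>
	#include <stdlib.h>

	struct ctx {
		void *pgd;
		void *context;
	};

	/* hypothetical stand-ins for mm_alloc_pgd() / init_new_context() */
	static void *alloc_pgd(void)     { return malloc(16); }
	static void *alloc_context(void) { return malloc(16); }

	static struct ctx *ctx_init(struct ctx *c)
	{
		c->pgd = alloc_pgd();
		if (!c->pgd)
			goto fail_nopgd;

		c->context = alloc_context();
		if (!c->context)
			goto fail_nocontext;

		return c;	/* fully initialized */

	fail_nocontext:
		free(c->pgd);	/* undo the pgd step */
	fail_nopgd:
		free(c);	/* release the object itself, as free_mm() does */
		return NULL;
	}

	int main(void)
	{
		struct ctx *raw = malloc(sizeof(*raw));
		struct ctx *c = raw ? ctx_init(raw) : NULL;

		printf("ctx_init %s\n", c ? "succeeded" : "failed");
		if (c) {
			free(c->context);
			free(c->pgd);
			free(c);
		}
		return 0;
	}

The payoff is that every error path releases exactly what was already acquired, and adding a new fallible step means adding one check and one label rather than rewriting the cleanup; it also lets the old dup_mm() fail_nocontext tail (which existed only because mmput() would call destroy_context()) disappear entirely.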
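Second, the copy_signal() hunk's comment "/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */" is terse. The two compound-literal assignments build the same two-entry circular list that INIT_LIST_HEAD() followed by list_add() would, and they are order-independent because each assignment only takes the other object's address, never reads its fields. A standalone sketch with a minimal list_head, again an illustration rather than kernel code:

	#include <assert.h>
	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };
	#define LIST_HEAD_INIT(name) { &(name), &(name) }

	int main(void)
	{
		struct list_head head, node;

		/* same pattern as sig->thread_head / tsk->thread_node above */
		head = (struct list_head)LIST_HEAD_INIT(node);
		node = (struct list_head)LIST_HEAD_INIT(head);

		/* result: head <-> node, a two-entry circular list, exactly
		 * what INIT_LIST_HEAD(&head); list_add(&node, &head); yields */
		assert(head.next == &node && head.prev == &node);
		assert(node.next == &head && node.prev == &head);
		printf("head and node form a circular list\n");
		return 0;
	}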
