summaryrefslogtreecommitdiff
path: root/kernel/fork.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--kernel/fork.c216
1 files changed, 138 insertions, 78 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 728d5be9548c..8c162d102740 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -71,6 +73,7 @@
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
+#include <linux/compiler.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -147,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
- THREAD_SIZE_ORDER);
+ struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
@@ -237,6 +240,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ task_numa_free(tsk);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
@@ -283,7 +287,7 @@ void __init fork_init(unsigned long mempages)
init_task.signal->rlim[RLIMIT_NPROC];
}
-int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+int __weak arch_dup_task_struct(struct task_struct *dst,
struct task_struct *src)
{
*dst = *src;
@@ -311,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
goto free_ti;
tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+ /*
+ * We must handle setting up seccomp filters once we're under
+ * the sighand lock in case orig has changed between now and
+ * then. Until then, filter must be NULL to avoid messing up
+ * the usage counts on the error path calling free_task.
+ */
+ tsk->seccomp.filter = NULL;
+#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
@@ -361,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0;
- mm->mmap = NULL;
- mm->mmap_cache = NULL;
- mm->map_count = 0;
- cpumask_clear(mm_cpumask(mm));
- mm->mm_rb = RB_ROOT;
+ mm->total_vm = oldmm->total_vm;
+ mm->shared_vm = oldmm->shared_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
@@ -417,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
atomic_dec(&inode->i_writecount);
mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
- mapping->i_mmap_writable++;
+ atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -523,27 +535,57 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ mm->owner = p;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
+ mm->mmap = NULL;
+ mm->mm_rb = RB_ROOT;
+ mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
- mm->flags = (current->mm) ?
- (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
+ mm->map_count = 0;
+ mm->locked_vm = 0;
+ mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mmu_notifier_mm_init(mm);
+ clear_tlb_flush_pending(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+#endif
- if (likely(!mm_alloc_pgd(mm))) {
+ if (current->mm) {
+ mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+ } else {
+ mm->flags = default_dump_filter;
mm->def_flags = 0;
- mmu_notifier_mm_init(mm);
- return mm;
}
+ if (mm_alloc_pgd(mm))
+ goto fail_nopgd;
+
+ if (init_new_context(p, mm))
+ goto fail_nocontext;
+
+ return mm;
+
+fail_nocontext:
+ mm_free_pgd(mm);
+fail_nopgd:
free_mm(mm);
return NULL;
}
@@ -559,9 +601,8 @@ static void check_mm(struct mm_struct *mm)
printk(KERN_ALERT "BUG: Bad rss-counter state "
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
-
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON(mm->pmd_huge_pte);
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
@@ -577,7 +618,6 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- mm_init_cpumask(mm);
return mm_init(mm, current);
}
@@ -799,30 +839,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
*/
-struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
int err;
- if (!oldmm)
- return NULL;
-
mm = allocate_mm();
if (!mm)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
- mm_init_cpumask(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- mm->pmd_huge_pte = NULL;
-#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
- if (init_new_context(tsk, mm))
- goto fail_nocontext;
-
dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm);
@@ -844,15 +874,6 @@ free_pt:
fail_nomem:
return NULL;
-
-fail_nocontext:
- /*
- * If init_new_context() failed, we cannot use mmput() to free the mm
- * because it calls destroy_context()
- */
- mm_free_pgd(mm);
- free_mm(mm);
- return NULL;
}
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -878,6 +899,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
+ /* initialize the new vmacache entries */
+ vmacache_flush(tsk);
+
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
@@ -1034,6 +1058,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nr_threads = 1;
atomic_set(&sig->live, 1);
atomic_set(&sig->sigcnt, 1);
+
+ /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
+ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
+ tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
+
init_waitqueue_head(&sig->wait_chldexit);
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
@@ -1066,13 +1095,37 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
+static void copy_seccomp(struct task_struct *p)
{
- unsigned long new_flags = p->flags;
+#ifdef CONFIG_SECCOMP
+ /*
+ * Must be called with sighand->lock held, which is common to
+ * all threads in the group. Holding cred_guard_mutex is not
+ * needed because this new task is not yet running and cannot
+ * be racing exec.
+ */
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Ref-count the new filter user, and assign it. */
+ get_seccomp_filter(current);
+ p->seccomp = current->seccomp;
+
+ /*
+ * Explicitly enable no_new_privs here in case it got set
+ * between the task_struct being duplicated and holding the
+ * sighand lock. The seccomp state and nnp must be in sync.
+ */
+ if (task_no_new_privs(current))
+ task_set_no_new_privs(p);
- new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
- new_flags |= PF_FORKNOEXEC;
- p->flags = new_flags;
+ /*
+ * If the parent gained a seccomp mode after copying thread
+ * flags and between before we held the sighand lock, we have
+ * to manually enable the seccomp thread flag here.
+ */
+ if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+ set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
}
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1086,18 +1139,12 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- plist_head_init(&p->pi_waiters);
+ p->pi_waiters = RB_ROOT;
+ p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
#endif
}
-#ifdef CONFIG_MM_OWNER
-void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
-{
- mm->owner = p;
-}
-#endif /* CONFIG_MM_OWNER */
-
/*
* Initialize POSIX timer handling for a single task.
*/
@@ -1171,7 +1218,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* do not allow it to share a thread group or signal handlers or
* parent with the forking task.
*/
- if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) {
+ if (clone_flags & CLONE_SIGHAND) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
@@ -1188,7 +1235,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
- get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1221,9 +1267,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- copy_flags(clone_flags, p);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
@@ -1254,9 +1300,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
posix_cpu_timers_init(p);
- do_posix_clock_monotonic_gettime(&p->start_time);
- p->real_start_time = p->start_time;
- monotonic_to_bootbased(&p->real_start_time);
+ p->start_time = ktime_get_ns();
+ p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
p->audit_context = NULL;
if (clone_flags & CLONE_THREAD)
@@ -1267,9 +1312,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_cgroup;
+ goto bad_fork_cleanup_threadgroup_lock;
}
- mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
@@ -1300,25 +1344,24 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
-#ifdef CONFIG_MEMCG
- p->memcg_batch.do_batch = 0;
- p->memcg_batch.memcg = NULL;
-#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
- sched_fork(clone_flags, p);
+ retval = sched_fork(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_cleanup_perf;
/* copy all the process information */
+ shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
@@ -1402,13 +1445,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->tgid = p->pid;
}
- p->pdeath_signal = 0;
- p->exit_state = 0;
-
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
+ p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
@@ -1430,6 +1471,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_lock(&current->sighand->siglock);
/*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
+ /*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
@@ -1471,6 +1518,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
atomic_inc(&current->signal->sigcnt);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
+ list_add_tail_rcu(&p->thread_node,
+ &p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
@@ -1478,7 +1527,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
+ syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
@@ -1514,15 +1565,15 @@ bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_audit:
audit_free(p);
-bad_fork_cleanup_policy:
+bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_cgroup:
+bad_fork_cleanup_threadgroup_lock:
#endif
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
- cgroup_exit(p, 0);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
@@ -1598,10 +1649,12 @@ long do_fork(unsigned long clone_flags,
*/
if (!IS_ERR(p)) {
struct completion vfork;
+ struct pid *pid;
trace_sched_process_fork(current, p);
- nr = task_pid_vnr(p);
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
@@ -1616,12 +1669,14 @@ long do_fork(unsigned long clone_flags,
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
- ptrace_event(trace, nr);
+ ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
- ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
+
+ put_pid(pid);
} else {
nr = PTR_ERR(p);
}
@@ -1644,7 +1699,7 @@ SYSCALL_DEFINE0(fork)
return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#else
/* can not support in nommu mode */
- return(-EINVAL);
+ return -EINVAL;
#endif
}
#endif
@@ -1652,7 +1707,7 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
0, NULL, NULL);
}
#endif
@@ -1859,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
*/
exit_sem(current);
}
+ if (unshare_flags & CLONE_NEWIPC) {
+ /* Orphan segments in old ns (see sem above). */
+ exit_shm(current);
+ shm_init_task(current);
+ }
if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);