From 0ac0c0d0f837c499afd02a802f9cf52d3027fa3b Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Wed, 26 May 2010 14:42:51 -0700
Subject: cpusets: randomize node rotor used in cpuset_mem_spread_node()

Some workloads that create a large number of small files tend to assign
too many pages to node 0 (on multi-node systems).  Part of the reason is
that the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
at node 0 for newly created tasks.

This patch changes the rotor to be initialized to a random node number
within the cpuset.

[akpm@linux-foundation.org: fix layout]
[Lee.Schermerhorn@hp.com: Define stub numa_random() for !NUMA configuration]
Signed-off-by: Jack Steiner
Signed-off-by: Lee Schermerhorn
Cc: Christoph Lameter
Cc: Pekka Enberg
Cc: Paul Menage
Cc: Jack Steiner
Cc: Robin Holt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 4d57d9e3a6e9..2e9cc3139ec6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1079,6 +1079,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 	mpol_fix_fork_child_flag(p);
 #endif
+#ifdef CONFIG_CPUSETS
+	p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
+	p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
+#endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-- 
cgit v1.2.3
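
The hunk above calls node_random(), a helper this patch adds outside
kernel/fork.c, so it does not appear in this fork.c-limited view.  As a
rough illustration only -- not the patch's actual implementation -- such
a helper could be built from the standard nodemask accessors:

	/*
	 * Illustrative sketch, not the patch's node_random(): pick a
	 * uniformly random node that is set in @mask, or -1 if none is.
	 */
	static int node_random_sketch(const nodemask_t mask)
	{
		int w, target, node;

		w = nodes_weight(mask);		/* how many nodes are allowed */
		if (!w)
			return -1;		/* empty mask: nothing to pick */

		target = get_random_int() % w;	/* index of the chosen node */
		node = first_node(mask);
		while (target--)
			node = next_node(node, mask);
		return node;
	}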
From 4dec2a91fd7e8815d730afbfdcf085cbf53433ac Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 26 May 2010 14:43:15 -0700
Subject: fork/exit: move tty_kref_put() outside of __cleanup_signal()

tty_kref_put() has two callsites in copy_process() paths:

 1. if copy_process() succeeds, it is called before we copy
    signal->tty from the parent

 2. otherwise, it is called from __cleanup_signal() under the
    bad_fork_cleanup_signal: label

In both cases tty_kref_put() is wrong and unneeded because we don't have
the balancing tty_kref_get().  Fortunately, this is harmless: it can only
happen without CLONE_THREAD, and in that case signal->tty must be NULL.

Remove tty_kref_put() from copy_process() and __cleanup_signal(), and
change the other caller of __cleanup_signal(), __exit_signal(), to call
tty_kref_put() by hand.

I hope this change makes sense by itself, but it is also needed to make
->signal refcountable.

Signed-off-by: Oleg Nesterov
Acked-by: Alan Cox
Acked-by: Roland McGrath
Cc: Greg KH
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 2e9cc3139ec6..b7879ef6e7cd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -892,7 +892,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 void __cleanup_signal(struct signal_struct *sig)
 {
 	thread_group_cputime_free(sig);
-	tty_kref_put(sig->tty);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -1263,7 +1262,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			p->nsproxy->pid_ns->child_reaper = p;
 
 		p->signal->leader_pid = pid;
-		tty_kref_put(p->signal->tty);
 		p->signal->tty = tty_kref_get(current->signal->tty);
 		attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
 		attach_pid(p, PIDTYPE_SID, task_session(current));
-- 
cgit v1.2.3
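
The argument above is a pure refcount-balance one: a tty_kref_put() is
legitimate only if a matching tty_kref_get() was taken earlier.  The
__exit_signal() side of the move lives in kernel/exit.c, so it is not
visible in this fork.c-limited diff; schematically (a sketch, not the
literal kernel code), the balanced pairing the series aims for is:

	/* Sketch of the intended pairing, not quoted from the kernel: */
	p->signal->tty = tty_kref_get(current->signal->tty); /* +1 ref, NULL-safe */
	/* ... lifetime of the thread group ... */
	tty_kref_put(p->signal->tty);                        /* -1 ref, NULL-safe */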
From ea6d290ca34c4fd91b7348338c0cc7bdeff94a35 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 26 May 2010 14:43:16 -0700
Subject: signals: make task_struct->signal immutable/refcountable

We have a lot of problems with accessing task_struct->signal: it can
"disappear" at any moment.  Even current can't use its ->signal safely
after exit_notify().  ->siglock helps, but it is not convenient, not
always possible, and sometimes it makes sense to use task->signal even
after this task is already dead.

This patch adds a reference counter, sigcnt, to signal_struct.  This
reference is owned by task_struct and is dropped in __put_task_struct().
Perhaps it makes sense to export get/put_signal_struct() later, but
currently I don't see an immediate reason.

Rename __cleanup_signal() to free_signal_struct() and unexport it.  With
the previous changes it does nothing except kmem_cache_free().

Change __exit_signal() to not clear/free ->signal; it will be freed when
the last reference to any thread in the thread group goes away.

Note:
 - when the last thread exits, signal->tty can point to nowhere; see
   the next patch.

 - with or without this patch, signal_struct->count should go away, or
   at least it should be "int nr_threads" for fs/proc.  This will be
   addressed later.

Signed-off-by: Oleg Nesterov
Cc: Alan Cox
Cc: Ingo Molnar
Cc: Peter Zijlstra
Acked-by: Roland McGrath
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index b7879ef6e7cd..e08e3012cd6b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,6 +165,18 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
+static inline void free_signal_struct(struct signal_struct *sig)
+{
+	thread_group_cputime_free(sig);
+	kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void put_signal_struct(struct signal_struct *sig)
+{
+	if (atomic_dec_and_test(&sig->sigcnt))
+		free_signal_struct(sig);
+}
+
 void __put_task_struct(struct task_struct *tsk)
 {
 	WARN_ON(!tsk->exit_state);
@@ -173,6 +185,7 @@
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
+	put_signal_struct(tsk->signal);
 
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
@@ -864,6 +877,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	if (!sig)
 		return -ENOMEM;
 
+	atomic_set(&sig->sigcnt, 1);
 	atomic_set(&sig->count, 1);
 	atomic_set(&sig->live, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
 	if (clone_flags & CLONE_NEWPID)
 		sig->flags |= SIGNAL_UNKILLABLE;
@@ -889,12 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-void __cleanup_signal(struct signal_struct *sig)
-{
-	thread_group_cputime_free(sig);
-	kmem_cache_free(signal_cachep, sig);
-}
-
 static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
@@ -1248,6 +1256,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (clone_flags & CLONE_THREAD) {
+		atomic_inc(&current->signal->sigcnt);
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
 		p->group_leader = current->group_leader;
 		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 	}
@@ -1294,7 +1303,7 @@ bad_fork_cleanup_mm:
 		mmput(p->mm);
bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
-		__cleanup_signal(p->signal);
+		free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
 	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
-- 
cgit v1.2.3

From a705be6b5e8b05f2ae51536ec709de921960326c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 26 May 2010 14:43:19 -0700
Subject: kill the obsolete thread_group_cputime_free() helper

Kill the empty thread_group_cputime_free() helper.  It was needed to
free the per-cpu data which we no longer have.

Signed-off-by: Oleg Nesterov
Cc: Balbir Singh
Cc: Roland McGrath
Cc: Veaceslav Falico
Cc: Stanislaw Gruszka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index e08e3012cd6b..58f8611b1ac6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -167,7 +167,6 @@ EXPORT_SYMBOL(free_task);
 
 static inline void free_signal_struct(struct signal_struct *sig)
 {
-	thread_group_cputime_free(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
-- 
cgit v1.2.3
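
After these two patches, free_signal_struct() is nothing but a
kmem_cache_free() wrapper, and references are taken inline with
atomic_inc(&...->sigcnt).  The exported "get" helper the sigcnt patch
alludes to would be the obvious mirror of put_signal_struct(); a
hypothetical sketch, since neither patch actually adds it:

	/* Hypothetical counterpart to put_signal_struct(); not in these patches. */
	static inline struct signal_struct *get_signal_struct(struct signal_struct *sig)
	{
		if (sig)
			atomic_inc(&sig->sigcnt);	/* take one reference */
		return sig;
	}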
From 97101eb41d0d3c97543878ce40e0b8a8b2747ed7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 26 May 2010 14:43:20 -0700
Subject: exit: move taskstats_tgid_free() from __exit_signal() to free_signal_struct()

Move taskstats_tgid_free() from __exit_signal() to free_signal_struct().
This way signal->stats never points to nowhere and we can read ->stats
locklessly.

Signed-off-by: Oleg Nesterov
Cc: Balbir Singh
Cc: Roland McGrath
Cc: Veaceslav Falico
Cc: Stanislaw Gruszka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 58f8611b1ac6..7701470ea1b8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -167,6 +167,7 @@ EXPORT_SYMBOL(free_task);
 
 static inline void free_signal_struct(struct signal_struct *sig)
 {
+	taskstats_tgid_free(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
-- 
cgit v1.2.3

From 6e1be45aa6ba6a36c0312f65ecf311135c73001d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 26 May 2010 14:43:21 -0700
Subject: check_unshare_flags: kill the bogus CLONE_SIGHAND/sig->count check

check_unshare_flags(CLONE_SIGHAND) adds CLONE_THREAD to *flags_ptr if
the task is multithreaded, to ensure unshare_thread() will fail.

Not only is this a strange way to return an error, it is absolutely
meaningless.  If signal->count > 1 then sighand->count must also be > 1,
and unshare_sighand() will fail anyway.

In fact, none of the CLONE_THREAD/SIGHAND/VM checks inside sys_unshare()
look right.  Fortunately this code doesn't really work anyway.

Signed-off-by: Oleg Nesterov
Cc: Balbir Singh
Acked-by: Roland McGrath
Cc: Veaceslav Falico
Cc: Stanislaw Gruszka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7701470ea1b8..40cd099cfc1b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1517,14 +1517,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
 	if (*flags_ptr & CLONE_VM)
 		*flags_ptr |= CLONE_SIGHAND;
 
-	/*
-	 * If unsharing signal handlers and the task was created
-	 * using CLONE_THREAD, then must unshare the thread
-	 */
-	if ((*flags_ptr & CLONE_SIGHAND) &&
-	    (atomic_read(&current->signal->count) > 1))
-		*flags_ptr |= CLONE_THREAD;
-
 	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */
-- 
cgit v1.2.3
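
The deletion above leans on a guard that already exists elsewhere in the
unshare path: if the caller shares its signal handlers with other
threads, sighand->count is necessarily > 1 and unshare_sighand()
refuses.  Paraphrased as a condensed sketch, not quoted verbatim from
kernel/fork.c:

	/*
	 * Condensed paraphrase of the existing unshare_sighand() guard:
	 * a sighand_struct shared with other threads cannot be unshared.
	 */
	if ((unshare_flags & CLONE_SIGHAND) &&
	    atomic_read(&current->sighand->count) > 1)
		return -EINVAL;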
Biederman" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 40cd099cfc1b..d32410bd4be7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -877,9 +877,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - atomic_set(&sig->sigcnt, 1); - atomic_set(&sig->count, 1); + sig->nr_threads = 1; atomic_set(&sig->live, 1); + atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; @@ -1256,9 +1256,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->sigcnt); - atomic_inc(¤t->signal->count); + current->signal->nr_threads++; atomic_inc(¤t->signal->live); + atomic_inc(¤t->signal->sigcnt); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } -- cgit v1.2.3 From f106eee10038c2ee5b6056aaf3f6d5229be6dcdd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:44:11 -0700 Subject: pids: fix fork_idle() to setup ->pids correctly copy_process(pid => &init_struct_pid) doesn't do attach_pid/etc. It shouldn't, but this means that the idle threads run with the wrong pids copied from the caller's task_struct. In x86 case the caller is either kernel_init() thread or keventd. In particular, this means that after the series of cpu_up/cpu_down an idle thread (which never exits) can run with .pid pointing to nowhere. Change fork_idle() to initialize idle->pids[] correctly. We only set .pid = &init_struct_pid but do not add .node to list, INIT_TASK() does the same for the boot-cpu idle thread (swapper). Signed-off-by: Oleg Nesterov Cc: Cedric Le Goater Cc: Dave Hansen Cc: Eric Biederman Cc: Herbert Poetzl Cc: Mathias Krause Acked-by: Roland McGrath Acked-by: Serge Hallyn Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index d32410bd4be7..bf9fef6d1bfe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1338,6 +1338,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re return regs; } +static inline void init_idle_pids(struct pid_link *links) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&links[type].node); /* not really needed */ + links[type].pid = &init_struct_pid; + } +} + struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; @@ -1345,8 +1355,10 @@ struct task_struct * __cpuinit fork_idle(int cpu) task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, &init_struct_pid, 0); - if (!IS_ERR(task)) + if (!IS_ERR(task)) { + init_idle_pids(task->pids); init_idle(task, cpu); + } return task; } -- cgit v1.2.3 From 35926ff5fba8245bd1c6ac04155048f6f89232b1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 30 May 2010 09:00:03 -0700 Subject: Revert "cpusets: randomize node rotor used in cpuset_mem_spread_node()" This reverts commit 0ac0c0d0f837c499afd02a802f9cf52d3027fa3b, which caused cross-architecture build problems for all the wrong reasons. 
From f106eee10038c2ee5b6056aaf3f6d5229be6dcdd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 26 May 2010 14:44:11 -0700
Subject: pids: fix fork_idle() to setup ->pids correctly

copy_process(pid => &init_struct_pid) doesn't do attach_pid/etc.  It
shouldn't, but this means that the idle threads run with the wrong pids
copied from the caller's task_struct.  In the x86 case the caller is
either the kernel_init() thread or keventd.

In particular, this means that after a series of cpu_up/cpu_down
operations an idle thread (which never exits) can run with .pid pointing
to nowhere.

Change fork_idle() to initialize idle->pids[] correctly.  We only set
.pid = &init_struct_pid but do not add .node to the list; INIT_TASK()
does the same for the boot-cpu idle thread (swapper).

Signed-off-by: Oleg Nesterov
Cc: Cedric Le Goater
Cc: Dave Hansen
Cc: Eric Biederman
Cc: Herbert Poetzl
Cc: Mathias Krause
Acked-by: Roland McGrath
Acked-by: Serge Hallyn
Cc: Sukadev Bhattiprolu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index d32410bd4be7..bf9fef6d1bfe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1338,6 +1338,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
 	return regs;
 }
 
+static inline void init_idle_pids(struct pid_link *links)
+{
+	enum pid_type type;
+
+	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+		INIT_HLIST_NODE(&links[type].node); /* not really needed */
+		links[type].pid = &init_struct_pid;
+	}
+}
+
 struct task_struct * __cpuinit fork_idle(int cpu)
 {
 	struct task_struct *task;
@@ -1345,8 +1355,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
 			    &init_struct_pid, 0);
-	if (!IS_ERR(task))
+	if (!IS_ERR(task)) {
+		init_idle_pids(task->pids);
 		init_idle(task, cpu);
+	}
 
 	return task;
 }
-- 
cgit v1.2.3

From 35926ff5fba8245bd1c6ac04155048f6f89232b1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 30 May 2010 09:00:03 -0700
Subject: Revert "cpusets: randomize node rotor used in cpuset_mem_spread_node()"

This reverts commit 0ac0c0d0f837c499afd02a802f9cf52d3027fa3b, which
caused cross-architecture build problems for all the wrong reasons.
IA64 already added its own version of __node_random(), but the fact is,
there is nothing architectural about the function, and the original
commit was just badly done.  Revert it, since no fix is forthcoming.

Requested-by: Stephen Rothwell
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index bf9fef6d1bfe..b6cce14ba047 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1086,10 +1086,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 	mpol_fix_fork_child_flag(p);
 #endif
-#ifdef CONFIG_CPUSETS
-	p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
-	p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
-#endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-- 
cgit v1.2.3