From 7bd36014460f793c19e7d6c94dab67b0afcfcb7f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Sep 2013 16:50:56 -0700 Subject: timekeeping: Fix HRTICK related deadlock from ntp lock changes Gerlando Falauto reported that when HRTICK is enabled, it is possible to trigger system deadlocks. These were hard to reproduce, as HRTICK has been broken in the past, but seemed to be connected to the timekeeping_seq lock. Since seqlock/seqcount's aren't supported w/ lockdep, I added some extra spinlock based locking and triggered the following lockdep output: [ 15.849182] ntpd/4062 is trying to acquire lock: [ 15.849765] (&(&pool->lock)->rlock){..-...}, at: [] __queue_work+0x145/0x480 [ 15.850051] [ 15.850051] but task is already holding lock: [ 15.850051] (timekeeper_lock){-.-.-.}, at: [] do_adjtimex+0x7f/0x100 [ 15.850051] Chain exists of: &(&pool->lock)->rlock --> &p->pi_lock --> timekeeper_lock [ 15.850051] Possible unsafe locking scenario: [ 15.850051] [ 15.850051] CPU0 CPU1 [ 15.850051] ---- ---- [ 15.850051] lock(timekeeper_lock); [ 15.850051] lock(&p->pi_lock); [ 15.850051] lock(timekeeper_lock); [ 15.850051] lock(&(&pool->lock)->rlock); [ 15.850051] [ 15.850051] *** DEADLOCK *** The deadlock was introduced by 06c017fdd4dc48451a ("timekeeping: Hold timekeepering locks in do_adjtimex and hardpps") in 3.10 This patch avoids this deadlock, by moving the call to schedule_delayed_work() outside of the timekeeper lock critical section. Reported-by: Gerlando Falauto Tested-by: Lin Ming Signed-off-by: John Stultz Cc: Mathieu Desnoyers Cc: stable #3.11, 3.10 Link: http://lkml.kernel.org/r/1378943457-27314-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 6 ++---- kernel/time/timekeeping.c | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8f5b3b98577b..bb2215174f05 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -516,13 +516,13 @@ static void sync_cmos_clock(struct work_struct *work) schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } -static void notify_cmos_timer(void) +void ntp_notify_cmos_timer(void) { schedule_delayed_work(&sync_cmos_work, 0); } #else -static inline void notify_cmos_timer(void) { } +void ntp_notify_cmos_timer(void) { } #endif @@ -687,8 +687,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; - notify_cmos_timer(); - return result; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 48b9fffabdc2..947ba25a95a0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1703,6 +1703,8 @@ int do_adjtimex(struct timex *txc) write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + ntp_notify_cmos_timer(); + return ret; } -- cgit v1.2.3 From 6c9a27f5da9609fca46cb2b183724531b48f71ad Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Tue, 10 Sep 2013 18:16:36 +0900 Subject: sched/fair: Fix small race where child->se.parent,cfs_rq might point to invalid ones There is a small race between copy_process() and cgroup_attach_task() where child->se.parent,cfs_rq points to invalid (old) ones. parent doing fork() | someone moving the parent to another cgroup -------------------------------+--------------------------------------------- copy_process() + dup_task_struct() -> parent->se is copied to child->se. se.parent,cfs_rq of them point to old ones. cgroup_attach_task() + cgroup_task_migrate() -> parent->cgroup is updated. + cpu_cgroup_attach() + sched_move_task() + task_move_group_fair() +- set_task_rq() -> se.parent,cfs_rq of parent are updated. + cgroup_fork() -> parent->cgroup is copied to child->cgroup. (*1) + sched_fork() + task_fork_fair() -> se.parent,cfs_rq of child are accessed while they point to old ones. (*2) In the worst case, this bug can lead to "use-after-free" and cause a panic, because it's new cgroup's refcount that is incremented at (*1), so the old cgroup(and related data) can be freed before (*2). In fact, a panic caused by this bug was originally caught in RHEL6.4. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] sched_slice+0x6e/0xa0 [...] Call Trace: [] place_entity+0x75/0xa0 [] task_fork_fair+0xaa/0x160 [] sched_fork+0x6b/0x140 [] copy_process+0x5b2/0x1450 [] ? wake_up_new_task+0xd9/0x130 [] do_fork+0x94/0x460 [] ? sys_wait4+0xae/0x100 [] sys_clone+0x28/0x30 [] stub_clone+0x13/0x20 [] ? system_call_fastpath+0x16/0x1b Signed-off-by: Daisuke Nishimura Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/039601ceae06$733d3130$59b79390$@mxp.nes.nec.co.jp Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b3fe1cd8f40..11cd13667359 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5928,11 +5928,15 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (unlikely(task_cpu(p) != this_cpu)) { - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - } + /* + * Not only the cpu but also the task_group of the parent might have + * been changed after parent->se.parent,cfs_rq were copied to + * child->se.parent,cfs_rq. So call __set_task_cpu() to make those + * of child point to valid ones. + */ + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); update_curr(cfs_rq); -- cgit v1.2.3 From fc840914e9b07ab4685c195e1e54e58de4f84c03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Sep 2013 13:01:41 +0200 Subject: sched/debug: Take PID namespace into account Emmanuel reported that /proc/sched_debug didn't report the right PIDs when using namespaces, cure this. Reported-by: Emmanuel Deloget Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130909110141.GM31370@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e076bddd4c66..196559994f7c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -124,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, " "); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", - p->comm, p->pid, + p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); @@ -289,7 +289,7 @@ do { \ P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); - P(curr->pid); + SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); P(cpu_load[0]); P(cpu_load[1]); @@ -492,7 +492,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" -- cgit v1.2.3 From 13b62e46d5407c7d619aea1dc9c3e0991b631b57 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Sep 2013 11:30:36 +0300 Subject: sched: Fix comment for sched_info_depart sched_info_depart seems to be only called from sched_info_switch(), so only on involuntary task switch. Fix the comment to match. Signed-off-by: Michael S. Tsirkin Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: KOSAKI Motohiro Link: http://lkml.kernel.org/r/20130916083036.GA1113@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/stats.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 5aef494fc8b4..c7edee71bce8 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -104,8 +104,9 @@ static inline void sched_info_queued(struct task_struct *t) } /* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. * Also, if the process is still in the TASK_RUNNING state, call * sched_info_queued() to mark that it has now again started waiting on * the runqueue. -- cgit v1.2.3