From 971c90bfa2f0b4fe52d6d9002178d547706f1343 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 4 Aug 2011 07:25:35 -0700 Subject: alarmtimers: Avoid possible null pointer traversal We don't check if old_setting is non null before assigning it, so correct this. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 59f369f98a04..1dee3f62a6a7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -479,11 +479,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, if (!rtcdev) return -ENOTSUPP; - /* Save old values */ - old_setting->it_interval = - ktime_to_timespec(timr->it.alarmtimer.period); - old_setting->it_value = - ktime_to_timespec(timr->it.alarmtimer.node.expires); + if (old_setting) + alarm_timer_get(timr, old_setting); /* If the timer was already set, cancel it */ alarm_cancel(&timr->it.alarmtimer); -- cgit v1.2.3 From ea7802f630d356acaf66b3c0b28c00a945fc35dc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 4 Aug 2011 07:51:56 -0700 Subject: alarmtimers: Memset itimerspec passed into alarm_timer_get Following common_timer_get, zero out the itimerspec passed in. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 1dee3f62a6a7..0e9263f6fd09 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -441,6 +441,8 @@ static int alarm_timer_create(struct k_itimer *new_timer) static void alarm_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { + memset(cur_setting, 0, sizeof(struct itimerspec)); + cur_setting->it_interval = ktime_to_timespec(timr->it.alarmtimer.period); cur_setting->it_value = -- cgit v1.2.3 From 6af7e471e5a7746b8024d70b4363d3dfe41d36b8 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 10:26:09 -0700 Subject: alarmtimers: Avoid possible denial of service with high freq periodic timers Its possible to jam up the alarm timers by setting very small interval timers, which will cause the alarmtimer subsystem to spend all of its time firing and restarting timers. This can effectivly lock up a box. A deeper fix is needed, closely mimicking the hrtimer code, but for now just cap the interval to 100us to avoid userland hanging the system. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0e9263f6fd09..ea5e1a928d5b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -481,6 +481,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, if (!rtcdev) return -ENOTSUPP; + /* + * XXX HACK! Currently we can DOS a system if the interval + * period on alarmtimers is too small. Cap the interval here + * to 100us and solve this properly in a future patch! -jstultz + */ + if ((new_setting->it_interval.tv_sec == 0) && + (new_setting->it_interval.tv_nsec < 100000)) + new_setting->it_interval.tv_nsec = 100000; + if (old_setting) alarm_timer_get(timr, old_setting); -- cgit v1.2.3 From c59d87c460767bc35dafd490139d3cfe78fb8da4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 12 Aug 2011 16:21:35 -0500 Subject: xfs: remove subdirectories Use the move from Linux 2.6 to Linux 3.x as an excuse to kill the annoying subdirectories in the XFS source code. Besides the large amount of file rename the only changes are to the Makefile, a few files including headers with the subdirectory prefix, and the binary sysctl compat code that includes a header under fs/xfs/ from kernel/. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- kernel/sysctl_binary.c | 2 +- kernel/sysctl_check.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 3b8e028b9601..e8bffbe2ba4b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1,6 +1,6 @@ #include #include -#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include "../fs/xfs/xfs_sysctl.h" #include #include #include diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 4e4932a7b360..362da653813d 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1,6 +1,6 @@ #include #include -#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include "../fs/xfs/xfs_sysctl.h" #include #include #include -- cgit v1.2.3 From 69dd3d8e29e294caaf63eb5e8a72d250279f9e5f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Aug 2011 10:36:51 -0700 Subject: Revert "irq: Always set IRQF_ONESHOT if no primary handler is specified" This reverts commit f3637a5f2e2eb391ff5757bc83fb5de8f9726464. It turns out that this breaks several drivers, one example being OMAP boards which use the on-board OMAP UARTs and the omap-serial driver that will not boot to userspace after the commit. Paul Walmsley reports that enabling CONFIG_DEBUG_SHIRQ reveals 'IRQ handler type mismatch' errors: IRQ handler type mismatch for IRQ 74 current handler: serial idle ... and the reason is that setting IRQF_ONESHOT will now result in those interrupt handlers having different IRQF flags, and thus being unsharable. So the commit log in the reverted commit: "Since it is required for those users and there is no difference for others it makes sense to add this flag unconditionally." is simply not true: there may not be any difference from a "actions at irq time", but there is a *big* difference wrt this flag testing irq management (see __setup_irq() in kernel/irq/manage.c). One solution may be to stop verifying IRQF_ONESHOT in __setup_irq(), but right now the safe course of action is to revert the change. Let's revisit this in a later merge window. Reported-by: Paul Walmsley Cc: Sebastian Andrzej Siewior Requested-by: Alan Cox Acked-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2e9425889fa8..9b956fa20308 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1331,7 +1331,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; - irqflags |= IRQF_ONESHOT; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); -- cgit v1.2.3 From be27425dcc516fd08245b047ea57f83b8f6f0903 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 19 Aug 2011 16:15:10 -0700 Subject: Add a personality to report 2.6.x version numbers I ran into a couple of programs which broke with the new Linux 3.0 version. Some of those were binary only. I tried to use LD_PRELOAD to work around it, but it was quite difficult and in one case impossible because of a mix of 32bit and 64bit executables. For example, all kind of management software from HP doesnt work, unless we pretend to run a 2.6 kernel. $ uname -a Linux svivoipvnx001 3.0.0-08107-g97cd98f #1062 SMP Fri Aug 12 18:11:45 CEST 2011 i686 i686 i386 GNU/Linux $ hpacucli ctrl all show Error: No controllers detected. $ rpm -qf /usr/sbin/hpacucli hpacucli-8.75-12.0 Another notable case is that Python now reports "linux3" from sys.platform(); which in turn can break things that were checking sys.platform() == "linux2": https://bugzilla.mozilla.org/show_bug.cgi?id=664564 It seems pretty clear to me though it's a bug in the apps that are using '==' instead of .startswith(), but this allows us to unbreak broken programs. This patch adds a UNAME26 personality that makes the kernel report a 2.6.40+x version number instead. The x is the x in 3.x. I know this is somewhat ugly, but I didn't find a better workaround, and compatibility to existing programs is important. Some programs also read /proc/sys/kernel/osrelease. This can be worked around in user space with mount --bind (and a mount namespace) To use: wget ftp://ftp.kernel.org/pub/linux/kernel/people/ak/uname26/uname26.c gcc -o uname26 uname26.c ./uname26 program Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sys.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index dd948a1fca4c..18ee1d2f6474 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -44,6 +46,8 @@ #include #include +/* Move somewhere else to avoid recompiling? */ +#include #include #include @@ -1161,6 +1165,34 @@ DECLARE_RWSEM(uts_sem); #define override_architecture(name) 0 #endif +/* + * Work around broken programs that cannot handle "Linux 3.0". + * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + */ +static int override_release(char __user *release, int len) +{ + int ret = 0; + char buf[len]; + + if (current->personality & UNAME26) { + char *rest = UTS_RELEASE; + int ndots = 0; + unsigned v; + + while (*rest) { + if (*rest == '.' && ++ndots >= 3) + break; + if (!isdigit(*rest) && *rest != '.') + break; + rest++; + } + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + snprintf(buf, len, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, len); + } + return ret; +} + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1170,6 +1202,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) errno = -EFAULT; up_read(&uts_sem); + if (!errno && override_release(name->release, sizeof(name->release))) + errno = -EFAULT; if (!errno && override_architecture(name)) errno = -EFAULT; return errno; @@ -1191,6 +1225,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) error = -EFAULT; up_read(&uts_sem); + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; if (!error && override_architecture(name)) error = -EFAULT; return error; @@ -1225,6 +1261,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) if (!error && override_architecture(name)) error = -EFAULT; + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; return error ? -EFAULT : 0; } #endif -- cgit v1.2.3 From 4c30c6f566c0989ddaee3407da44751e340a63ed Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 25 Aug 2011 15:59:11 -0700 Subject: kernel/printk: do not turn off bootconsole in printk_late_init() if keep_bootcon It seems that 7bf693951a8e ("console: allow to retain boot console via boot option keep_bootcon") doesn't always achieve what it aims, as when printk_late_init() runs it unconditionally turns off all boot consoles. With this patch, I am able to see more messages on the boot console in KVM guests than I can without, when keep_bootcon is specified. I think it is appropriate for the relevant -stable trees. However, it's more of an annoyance than a serious bug (ideally you don't need to keep the boot console around as console handover should be working -- I was encountering a situation where the console handover wasn't working and not having the boot console available meant I couldn't see why). Signed-off-by: Nishanth Aravamudan Cc: David S. Miller Cc: Alan Cox Cc: Greg KH Acked-by: Fabio M. Di Nitto Cc: [2.6.39.x, 3.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 836a2ae0ac31..28a40d8171b8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1604,7 +1604,7 @@ static int __init printk_late_init(void) struct console *con; for_each_console(con) { - if (con->flags & CON_BOOT) { + if (!keep_bootcon && con->flags & CON_BOOT) { printk(KERN_INFO "turn off boot console %s%d\n", con->name, con->index); unregister_console(con); -- cgit v1.2.3 From f5b940997397229975ea073679b03967932a541b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Aug 2011 18:03:11 -0400 Subject: All Arch: remove linkage for sys_nfsservctl system call The nfsservctl system call is now gone, so we should remove all linkage for it. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 62cbc8877fef..a9a5de07c4f1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -cond_syscall(sys_nfsservctl); cond_syscall(sys_quotactl); cond_syscall(sys32_quotactl); cond_syscall(sys_acct); -- cgit v1.2.3 From c259e01a1ec90063042f758e409cd26b2a0963c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:00 +0200 Subject: sched: Separate the scheduler entry for preemption Block-IO and workqueues call into notifier functions from the scheduler core code with interrupts and preemption disabled. These calls should be made before entering the scheduler core. To simplify this, separate the scheduler core code into __schedule(). __schedule() is directly called from the places which set PREEMPT_ACTIVE and from schedule(). This allows us to add the work checks into schedule(), so they are only called when a task voluntary goes to sleep. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Cc: stable@kernel.org # 2.6.39+ Link: http://lkml.kernel.org/r/20110622174918.813258321@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..ec15e8129cf7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4279,9 +4279,9 @@ pick_next_task(struct rq *rq) } /* - * schedule() is the main scheduler function. + * __schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +static void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -4369,6 +4369,11 @@ need_resched: if (need_resched()) goto need_resched; } + +asmlinkage void schedule(void) +{ + __schedule(); +} EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER @@ -4435,7 +4440,7 @@ asmlinkage void __sched notrace preempt_schedule(void) do { add_preempt_count_notrace(PREEMPT_ACTIVE); - schedule(); + __schedule(); sub_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -4463,7 +4468,7 @@ asmlinkage void __sched preempt_schedule_irq(void) do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); - schedule(); + __schedule(); local_irq_disable(); sub_preempt_count(PREEMPT_ACTIVE); @@ -5588,7 +5593,7 @@ static inline int should_resched(void) static void __cond_resched(void) { add_preempt_count(PREEMPT_ACTIVE); - schedule(); + __schedule(); sub_preempt_count(PREEMPT_ACTIVE); } -- cgit v1.2.3 From 9c40cef2b799f9b5e7fa5de4d2ad3a0168ba118c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:01 +0200 Subject: sched: Move blk_schedule_flush_plug() out of __schedule() There is no real reason to run blk_schedule_flush_plug() with interrupts and preemption disabled. Move it into schedule() and call it when the task is going voluntarily to sleep. There might be false positives when the task is woken between that call and actually scheduling, but that's not really different from being woken immediately after switching away. This fixes a deadlock in the scheduler where the blk_schedule_flush_plug() callchain enables interrupts and thereby allows a wakeup to happen of the task that's going to sleep. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Cc: stable@kernel.org # 2.6.39+ Link: http://lkml.kernel.org/n/tip-dwfxtra7yg1b5r65m32ywtct@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ec15e8129cf7..511732c39b6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4322,16 +4322,6 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } - - /* - * If we are going to sleep and we have plugged IO - * queued, make sure to submit it to avoid deadlocks. - */ - if (blk_needs_flush_plug(prev)) { - raw_spin_unlock(&rq->lock); - blk_schedule_flush_plug(prev); - raw_spin_lock(&rq->lock); - } } switch_count = &prev->nvcsw; } @@ -4370,8 +4360,23 @@ need_resched: goto need_resched; } +static inline void sched_submit_work(struct task_struct *tsk) +{ + if (!tsk->state) + return; + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} + asmlinkage void schedule(void) { + struct task_struct *tsk = current; + + sched_submit_work(tsk); __schedule(); } EXPORT_SYMBOL(schedule); -- cgit v1.2.3 From feff8fa0075bdfd43c841e9d689ed81adda988d6 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 18 Aug 2011 20:36:57 +0800 Subject: sched: Fix a memory leak in __sdt_free() This patch fixes the following memory leak: unreferenced object 0xffff880107266800 (size 512): comm "sched-powersave", pid 3718, jiffies 4323097853 (age 27495.450s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] create_object+0x187/0x28b [] kmemleak_alloc+0x73/0x98 [] __kmalloc_node+0x104/0x159 [] kzalloc_node.clone.97+0x15/0x17 [] build_sched_domains+0xb7/0x7f3 [] partition_sched_domains+0x1db/0x24a [] do_rebuild_sched_domains+0x3b/0x47 [] rebuild_sched_domains+0x10/0x12 [] sched_power_savings_store+0x6c/0x7b [] sched_mc_power_savings_store+0x16/0x18 [] sysdev_class_store+0x20/0x22 [] sysfs_write_file+0x108/0x144 [] vfs_write+0xaf/0x102 [] sys_write+0x4d/0x74 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff Signed-off-by: WANG Cong Signed-off-by: Peter Zijlstra Cc: stable@kernel.org # 3.0 Link: http://lkml.kernel.org/r/1313671017-4112-1-git-send-email-amwang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 511732c39b6e..c79e7c63a4aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7453,6 +7453,7 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); if (sd && (sd->flags & SD_OVERLAP)) free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); kfree(*per_cpu_ptr(sdd->sg, j)); kfree(*per_cpu_ptr(sdd->sgp, j)); } -- cgit v1.2.3 From a8d757ef076f0f95f13a918808824058de25b3eb Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 25 Aug 2011 15:58:03 +0200 Subject: perf events: Fix slow and broken cgroup context switch code The current cgroup context switch code was incorrect leading to bogus counts. Furthermore, as soon as there was an active cgroup event on a CPU, the context switch cost on that CPU would increase by a significant amount as demonstrated by a simple ping/pong example: $ ./pong Both processes pinned to CPU1, running for 10s 10684.51 ctxsw/s Now start a cgroup perf stat: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 $ ./pong Both processes pinned to CPU1, running for 10s 6674.61 ctxsw/s That's a 37% penalty. Note that pong is not even in the monitored cgroup. The results shown by perf stat are bogus: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 Performance counter stats for 'sleep 100': CPU1 cycles test CPU1 16,984,189,138 cycles # 0.000 GHz The second 'cycles' event should report a count @ CPU clock (here 2.4GHz) as it is counting across all cgroups. The patch below fixes the bogus accounting and bypasses any cgroup switches in case the outgoing and incoming tasks are in the same cgroup. With this patch the same test now yields: $ ./pong Both processes pinned to CPU1, running for 10s 10775.30 ctxsw/s Start perf stat with cgroup: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Run pong outside the cgroup: $ /pong Both processes pinned to CPU1, running for 10s 10687.80 ctxsw/s The penalty is now less than 2%. And the results for perf stat are correct: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz Now perf stat reports the correct counts for for the non cgroup event. If we run pong inside the cgroup, then we also get the correct counts: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 22,297,726,205 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz 10.001457237 seconds time elapsed Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 63 +++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched.c | 2 +- 2 files changed, 54 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b8785e26ee1c..45847fbb599a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -399,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* + * next is NULL when called from perf_event_enable_on_exec() + * that will systematically cause a cgroup_switch() + */ + if (next) + cgrp2 = perf_cgroup_from_task(next); + + /* + * only schedule out current cgroup events if we know + * that we are switching to a different cgroup. Otherwise, + * do no touch the cgroup events. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { - perf_cgroup_switch(task, PERF_CGROUP_SWIN); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* prev can never be NULL */ + cgrp2 = perf_cgroup_from_task(prev); + + /* + * only need to schedule in cgroup events if we are changing + * cgroup during ctxsw. Cgroup events were not scheduled + * out of ctxsw out if that was not the case. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWIN); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -518,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { } @@ -1988,7 +2030,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_out(task); + perf_cgroup_sched_out(task, next); } static void task_ctx_sched_out(struct perf_event_context *ctx) @@ -2153,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void __perf_event_task_sched_in(struct task_struct *task) +void __perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task) { struct perf_event_context *ctx; int ctxn; @@ -2171,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task) * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_in(task); + perf_cgroup_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2427,7 +2470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * ctxswin cgroup events which are already scheduled * in. */ - perf_cgroup_sched_out(current); + perf_cgroup_sched_out(current, NULL); raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..0408cdc6d572 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3065,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_disable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - perf_event_task_sched_in(current); + perf_event_task_sched_in(prev, current); #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -- cgit v1.2.3 From 7f310a5d4e8525ac0cc2f58c973d2100ce034410 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 23 Jun 2011 16:34:38 -0400 Subject: perf_event: Fix broken calc_timer_values() We detected a serious issue with PERF_SAMPLE_READ and timing information when events were being multiplexing. Samples would have time_running > time_enabled. That was easy to reproduce with a libpfm4 example (ran 3 times to cause multiplexing on Core 2): $ syst_smpl -e uops_retired:freq=1 & $ syst_smpl -e uops_retired:freq=1 & $ syst_smpl -e uops_retired:freq=1 & IIP:0x0000000040062d ... PERIOD:2355332948 ENA=40144625315 RUN=60014875184 syst_smpl: WARNING: time_running > time_enabled 63277537998 uops_retired:freq=1 , scaled The bug was not present in kernel up to (and including) 3.0. It turns out the bug was introduced by the following commit: commit c4794295917ebeda8013b6cb9c8d71ab4f74a1fa events: Move lockless timer calculation into helper function The parameters of the function got reversed yet the call sites were not updated to reflect the change. That lead to time_running and time_enabled being swapped. That had no effect when there was no multiplexing because in that case time_running = time_enabled but it would show up in any other scenario. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110829124112.GA4828@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 45847fbb599a..0f857782d06f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3396,8 +3396,8 @@ static int perf_event_index(struct perf_event *event) } static void calc_timer_values(struct perf_event *event, - u64 *running, - u64 *enabled) + u64 *enabled, + u64 *running) { u64 now, ctx_time; -- cgit v1.2.3 From ed585a651681e822089087b426e6ebfb6d3d9873 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 11 Sep 2011 13:59:27 +0200 Subject: genirq: Make irq_shutdown() symmetric vs. irq_startup again If an irq_chip provides .irq_shutdown(), but neither of .irq_disable() or .irq_mask(), free_irq() crashes when jumping to NULL. Fix this by only trying .irq_disable() and .irq_mask() if there's no .irq_shutdown() provided. This revives the symmetry with irq_startup(), which tries .irq_startup(), .irq_enable(), and irq_unmask(), and makes it consistent with the comment for irq_chip.irq_shutdown() in , which says: * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) This is also how __free_irq() behaved before the big overhaul, cfr. e.g. 3b56f0585fd4c02d047dc406668cb40159b2d340 ("genirq: Remove bogus conditional"), where the core interrupt code always overrode .irq_shutdown() to .irq_disable() if .irq_shutdown() was NULL. Signed-off-by: Geert Uytterhoeven Cc: linux-m68k@lists.linux-m68k.org Link: http://lkml.kernel.org/r/1315742394-16036-2-git-send-email-geert@linux-m68k.org Cc: stable@kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d5a3009da71a..dc5114b4c16c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -178,7 +178,7 @@ void irq_shutdown(struct irq_desc *desc) desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) desc->irq_data.chip->irq_shutdown(&desc->irq_data); - if (desc->irq_data.chip->irq_disable) + else if (desc->irq_data.chip->irq_disable) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); -- cgit v1.2.3 From fa2563e41c3d6d6e8af437643981ed28ae0cb56d Mon Sep 17 00:00:00 2001 From: Thomas Tuttle Date: Wed, 14 Sep 2011 16:22:28 -0700 Subject: workqueue: lock cwq access in drain_workqueue Take cwq->gcwq->lock to avoid racing between drain_workqueue checking to make sure the workqueues are empty and cwq_dec_nr_in_flight decrementing and then incrementing nr_active when it activates a delayed work. We discovered this when a corner case in one of our drivers resulted in us trying to destroy a workqueue in which the remaining work would always requeue itself again in the same workqueue. We would hit this race condition and trip the BUG_ON on workqueue.c:3080. Signed-off-by: Thomas Tuttle Acked-by: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 25fb1b0e53fa..1783aabc6128 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2412,8 +2412,13 @@ reflush: for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + bool drained; - if (!cwq->nr_active && list_empty(&cwq->delayed_works)) + spin_lock_irq(&cwq->gcwq->lock); + drained = !cwq->nr_active && list_empty(&cwq->delayed_works); + spin_unlock_irq(&cwq->gcwq->lock); + + if (drained) continue; if (++flush_cnt == 10 || -- cgit v1.2.3 From 3be209a8e22cedafc1b6945608b7bb8d9887ab61 Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Mon, 12 Sep 2011 09:28:04 -0500 Subject: sched/rt: Migrate equal priority tasks to available CPUs Commit 43fa5460fe60dea5c610490a1d263415419c60f6 ("sched: Try not to migrate higher priority RT tasks") also introduced a change in behavior which keeps RT tasks on the same CPU if there is an equal priority RT task currently running even if there are empty CPUs available. This can cause unnecessary wakeup latencies, and can prevent the scheduler from balancing all RT tasks across available CPUs. This change causes an RT task to search for a new CPU if an equal priority RT task is already running on wakeup. Lower priority tasks will still have to wait on higher priority tasks, but the system should still balance out because there is always the possibility that if there are both a high and low priority RT tasks on a given CPU that the high priority task could wakeup while the low priority task is running and force it to search for a better runqueue. Signed-off-by: Shawn Bohrer Acked-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: stable@kernel.org # 37+ Link: http://lkml.kernel.org/r/1315837684-18733-1-git-send-email-sbohrer@rgmadvisors.com Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 97540f0c9e47..af1177858be3 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1050,7 +1050,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) */ if (curr && unlikely(rt_task(curr)) && (curr->rt.nr_cpus_allowed < 2 || - curr->prio < p->prio) && + curr->prio <= p->prio) && (p->rt.nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); @@ -1581,7 +1581,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) p->rt.nr_cpus_allowed > 1 && rt_task(rq->curr) && (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio < p->prio)) + rq->curr->prio <= p->prio)) push_rt_tasks(rq); } -- cgit v1.2.3 From 1a51410abe7d0ee4b1d112780f46df87d3621043 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 19 Sep 2011 17:04:37 -0700 Subject: Make TASKSTATS require root access Ok, this isn't optimal, since it means that 'iotop' needs admin capabilities, and we may have to work on this some more. But at the same time it is very much not acceptable to let anybody just read anybody elses IO statistics quite at this level. Use of the GENL_ADMIN_PERM suggested by Johannes Berg as an alternative to checking the capabilities by hand. Reported-by: Vasiliy Kulikov Cc: Johannes Berg Acked-by: Balbir Singh Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e19ce1454ee1..e66046456f4f 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -655,6 +655,7 @@ static struct genl_ops taskstats_ops = { .cmd = TASKSTATS_CMD_GET, .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, + .flags = GENL_ADMIN_PERM, }; static struct genl_ops cgroupstats_ops = { -- cgit v1.2.3 From 58c3c3aa01b455ecb99d61ce73f1444274af696b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 19 Sep 2011 17:10:57 -0700 Subject: Make taskstats round statistics down to nearest 1k bytes/events Even with just the interface limited to admin, there really is little to reason to give byte-per-byte counts for taskstats. So round it down to something less intrusive. Acked-by: Balbir Singh Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 24dc60d9fa1f..5bbfac85866e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) #define KB 1024 #define MB (1024*KB) +#define KB_MASK (~(KB-1)) /* * fill in extended accounting fields */ @@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->ioac.rchar; - stats->write_char = p->ioac.wchar; - stats->read_syscalls = p->ioac.syscr; - stats->write_syscalls = p->ioac.syscw; + stats->read_char = p->ioac.rchar & KB_MASK; + stats->write_char = p->ioac.wchar & KB_MASK; + stats->read_syscalls = p->ioac.syscr & KB_MASK; + stats->write_syscalls = p->ioac.syscw & KB_MASK; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes; - stats->write_bytes = p->ioac.write_bytes; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; + stats->read_bytes = p->ioac.read_bytes & KB_MASK; + stats->write_bytes = p->ioac.write_bytes & KB_MASK; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.2.3 From eef24afb28561a5a9f4be8f8da97735b7e6a826f Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 14 Sep 2011 11:31:37 -0500 Subject: irq: Fix check for already initialized irq_domain in irq_domain_add The sanity check in irq_domain_add() tests desc->irq_data != NULL or irq_data->domain != NULL. This prevents adding an irq_domain to a irq descriptor when irq_data exists, which true when the irq descriptor exists. This went unnoticed so far as the simple domain code did not enter this code path because domain->nr_irqs is always 0 for the simple domains. Split the check for irq_data == NULL out and have a separate warning for it. [ tglx: Made the check for irq_data == NULL separate ] Signed-off-by: Rob Herring Cc: Grant Likely Cc: marc.zyngier@arm.com Cc: thomas.abraham@linaro.org Cc: jamie@jamieiles.com Cc: b-cousson@ti.com Cc: shawn.guo@linaro.org Cc: linux-arm-kernel@lists.infradead.org Cc: devicetree-discuss@lists.ozlabs.org Link: http://lkml.kernel.org/r/1316017900-19918-3-git-send-email-robherring2@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d5828da3fd38..b57a3776de44 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -29,7 +29,11 @@ void irq_domain_add(struct irq_domain *domain) */ for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); - if (d || d->domain) { + if (!d) { + WARN(1, "error: assigning domain to non existant irq_desc"); + return; + } + if (d->domain) { /* things are broken; just report, don't clean up */ WARN(1, "error: irq_desc already assigned to a domain"); return; -- cgit v1.2.3 From f9d81f61c84aca693bc353dfef4b8c36c2e5e1b5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Sep 2011 19:46:22 +0200 Subject: ptrace: PTRACE_LISTEN forgets to unlock ->siglock If PTRACE_LISTEN fails after lock_task_sighand() it doesn't drop ->siglock. Reported-by: Matt Fleming Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9de3ecfd20f9..a70d2a5d8c7b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -744,20 +744,17 @@ int ptrace_request(struct task_struct *child, long request, break; si = child->last_siginfo; - if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) - break; - - child->jobctl |= JOBCTL_LISTENING; - - /* - * If NOTIFY is set, it means event happened between start - * of this trap and now. Trigger re-trap immediately. - */ - if (child->jobctl & JOBCTL_TRAP_NOTIFY) - signal_wake_up(child, true); - + if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) { + child->jobctl |= JOBCTL_LISTENING; + /* + * If NOTIFY is set, it means event happened between + * start of this trap and now. Trigger re-trap. + */ + if (child->jobctl & JOBCTL_TRAP_NOTIFY) + signal_wake_up(child, true); + ret = 0; + } unlock_task_sighand(child, &flags); - ret = 0; break; case PTRACE_DETACH: /* detach a process that was attached. */ -- cgit v1.2.3 From 6ebbe7a07b3bc40b168d2afc569a6543c020d2e3 Mon Sep 17 00:00:00 2001 From: Simon Kirby Date: Thu, 22 Sep 2011 17:03:46 -0700 Subject: sched: Fix up wchan borkage Commit c259e01a1ec ("sched: Separate the scheduler entry for preemption") contained a boo-boo wrecking wchan output. It forgot to put the new schedule() function in the __sched section and thereby doesn't get properly ignored for things like wchan. Tested-by: Simon Kirby Cc: stable@kernel.org # 2.6.39+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110923000346.GA25425@hostway.ca Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ec5f472bc5b9..d249ea88428c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4372,7 +4372,7 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -asmlinkage void schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *tsk = current; -- cgit v1.2.3 From 47ea91b4052d9e94b9dca5d7a3d947fbebd07ba9 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 22 Sep 2011 15:48:58 +0800 Subject: Resource: fix wrong resource window calculation __find_resource() incorrectly returns a resource window which overlaps an existing allocated window. This happens when the parent's resource-window spans 0x00000000 to 0xffffffff and is entirely allocated to all its children resource-windows. __find_resource() looks for gaps in resource allocation among the children resource windows. When it encounters the last child window it blindly tries the range next to one allocated to the last child. Since the last child's window ends at 0xffffffff the calculation overflows, leading the algorithm to believe that any window in the range 0x0000000 to 0xfffffff is available for allocation. This leads to a conflicting window allocation. Michal Ludvig reported this issue seen on his platform. The following patch fixes the problem and has been verified by Michal. I believe this bug has been there for ages. It got exposed by git commit 2bbc6942273b ("PCI : ability to relocate assigned pci-resources") Signed-off-by: Ram Pai Tested-by: Michal Ludvig Signed-off-by: Linus Torvalds --- kernel/resource.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 3b3cedc52592..c8dc249da5ce 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old, else tmp.end = root->end; + if (tmp.end < tmp.start) + goto next; + resource_clip(&tmp, constraint->min, constraint->max); arch_remove_reservations(&tmp); @@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old, return 0; } } - if (!this) + +next: if (!this || this->end == root->end) break; + if (this != old) tmp.start = this->end + 1; this = this->sibling; -- cgit v1.2.3 From d670ec13178d0fd8680e6742a2bc6e04f28f87d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Sep 2011 12:42:04 +0200 Subject: posix-cpu-timers: Cure SMP wobbles David reported: Attached below is a watered-down version of rt/tst-cpuclock2.c from GLIBC. Just build it with "gcc -o test test.c -lpthread -lrt" or similar. Run it several times, and you will see cases where the main thread will measure a process clock difference before and after the nanosleep which is smaller than the cpu-burner thread's individual thread clock difference. This doesn't make any sense since the cpu-burner thread is part of the top-level process's thread group. I've reproduced this on both x86-64 and sparc64 (using both 32-bit and 64-bit binaries). For example: [davem@boricha build-x86_64-linux]$ ./test process: before(0.001221967) after(0.498624371) diff(497402404) thread: before(0.000081692) after(0.498316431) diff(498234739) self: before(0.001223521) after(0.001240219) diff(16698) [davem@boricha build-x86_64-linux]$ The diff of 'process' should always be >= the diff of 'thread'. I make sure to wrap the 'thread' clock measurements the most tightly around the nanosleep() call, and that the 'process' clock measurements are the outer-most ones. --- #include #include #include #include #include #include #include #include static pthread_barrier_t barrier; static void *chew_cpu(void *arg) { pthread_barrier_wait(&barrier); while (1) __asm__ __volatile__("" : : : "memory"); return NULL; } int main(void) { clockid_t process_clock, my_thread_clock, th_clock; struct timespec process_before, process_after; struct timespec me_before, me_after; struct timespec th_before, th_after; struct timespec sleeptime; unsigned long diff; pthread_t th; int err; err = clock_getcpuclockid(0, &process_clock); if (err) return 1; err = pthread_getcpuclockid(pthread_self(), &my_thread_clock); if (err) return 1; pthread_barrier_init(&barrier, NULL, 2); err = pthread_create(&th, NULL, chew_cpu, NULL); if (err) return 1; err = pthread_getcpuclockid(th, &th_clock); if (err) return 1; pthread_barrier_wait(&barrier); err = clock_gettime(process_clock, &process_before); if (err) return 1; err = clock_gettime(my_thread_clock, &me_before); if (err) return 1; err = clock_gettime(th_clock, &th_before); if (err) return 1; sleeptime.tv_sec = 0; sleeptime.tv_nsec = 500000000; nanosleep(&sleeptime, NULL); err = clock_gettime(th_clock, &th_after); if (err) return 1; err = clock_gettime(my_thread_clock, &me_after); if (err) return 1; err = clock_gettime(process_clock, &process_after); if (err) return 1; diff = process_after.tv_nsec - process_before.tv_nsec; printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", process_before.tv_sec, process_before.tv_nsec, process_after.tv_sec, process_after.tv_nsec, diff); diff = th_after.tv_nsec - th_before.tv_nsec; printf("thread: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", th_before.tv_sec, th_before.tv_nsec, th_after.tv_sec, th_after.tv_nsec, diff); diff = me_after.tv_nsec - me_before.tv_nsec; printf("self: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", me_before.tv_sec, me_before.tv_nsec, me_after.tv_sec, me_after.tv_nsec, diff); return 0; } This is due to us using p->se.sum_exec_runtime in thread_group_cputime() where we iterate the thread group and sum all data. This does not take time since the last schedule operation (tick or otherwise) into account. We can cure this by using task_sched_runtime() at the cost of having to take locks. This also means we can (and must) do away with thread_group_sched_runtime() since the modified thread_group_cputime() is now more accurate and would deadlock when called from thread_group_sched_runtime(). Aside of that it makes the function safe on 32 bit systems. The old code added t->se.sum_exec_runtime unprotected. sum_exec_runtime is a 64bit value and could be changed on another cpu at the same time. Reported-by: David Miller Signed-off-by: Peter Zijlstra Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins Tested-by: David Miller Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 5 +++-- kernel/sched.c | 24 ------------------------ 2 files changed, 3 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 58f405b581e7..c8008dd58ef2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); - times->sum_exec_runtime += t->se.sum_exec_runtime; + times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: rcu_read_unlock(); @@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock, cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = thread_group_sched_runtime(p); + thread_group_cputime(p, &cputime); + cpu->sched = cputime.sum_exec_runtime; break; } return 0; diff --git a/kernel/sched.c b/kernel/sched.c index d249ea88428c..b50b0f0c9aa9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3724,30 +3724,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -/* - * Return sum_exec_runtime for the thread group. - * In case the task is currently running, return the sum plus current's - * pending runtime that have not been accounted yet. - * - * Note that the thread group might have other running tasks as well, - * so the return value not includes other pending runtime that other - * running tasks might have. - */ -unsigned long long thread_group_sched_runtime(struct task_struct *p) -{ - struct task_cputime totals; - unsigned long flags; - struct rq *rq; - u64 ns; - - rq = task_rq_lock(p, &flags); - thread_group_cputime(p, &totals); - ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to -- cgit v1.2.3