From e154ff3d2c5ad313ef0c66e6217502361cad2799 Mon Sep 17 00:00:00 2001
From: Roman Zippel
Date: Mon, 10 Jul 2006 04:44:32 -0700
Subject: [PATCH] adjust clock for lost ticks

A large number of lost ticks can cause an overadjustment of the clock.
To compensate for this, we look at the current error: the larger the
error already is, the more careful we are about adjusting it further.
As a small extra fix, reset the error when the clock is set.

Signed-off-by: Roman Zippel
Acked-by: john stultz
Cc: Uwe Bugla
Cc: James Bottomley
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 85 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 47 insertions(+), 38 deletions(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index 396a3c024c2c..2a87430a58d4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -891,6 +891,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
+	clock->error = 0;
 	ntp_clear();
 
 	write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -1008,52 +1009,52 @@ static int __init timekeeping_init_device(void)
 device_initcall(timekeeping_init_device);
 
 /*
- * If the error is already larger, we look ahead another tick,
+ * If the error is already larger, we look ahead even further,
  * to compensate for late or lost adjustments.
  */
-static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
 {
-	int adj;
+	s64 tick_error, i;
+	u32 look_ahead, adj;
+	s32 error2, mult;
 
 	/*
-	 * As soon as the machine is synchronized to the external time
-	 * source this should be the common case.
+	 * Use the current error value to determine how much to look ahead.
+	 * The larger the error the slower we adjust for it to avoid problems
+	 * with losing too many ticks, otherwise we would overadjust and
+	 * produce an even larger error.  The smaller the adjustment the
+	 * faster we try to adjust for it, as lost ticks can do less harm
+	 * here.  This is tuned so that an error of about 1 msec is adjusted
+	 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
 	 */
-	error >>= 2;
-	if (likely(sign > 0 ? error <= *interval : error >= *interval))
-		return sign;
+	error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
+	error2 = abs(error2);
+	for (look_ahead = 0; error2 > 0; look_ahead++)
+		error2 >>= 2;
 
 	/*
-	 * An extra look ahead dampens the effect of the current error,
-	 * which can grow quite large with continously late updates, as
-	 * it would dominate the adjustment value and can lead to
-	 * oscillation.
+	 * Now calculate the error in (1 << look_ahead) ticks, but first
+	 * remove the single look ahead already included in the error.
 	 */
-	error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
-	error -= clock->xtime_interval >> 1;
-
-	adj = 0;
-	while (1) {
-		error >>= 1;
-		if (sign > 0 ? error <= *interval : error >= *interval)
-			break;
-		adj++;
+	tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
+	tick_error -= clock->xtime_interval >> 1;
+	error = ((error - tick_error) >> look_ahead) + tick_error;
+
+	/* Finally calculate the adjustment shift value. */
+	i = *interval;
+	mult = 1;
+	if (error < 0) {
+		error = -error;
+		*interval = -*interval;
+		*offset = -*offset;
+		mult = -1;
 	}
-
-	/*
-	 * Add the current adjustments to the error and take the offset
-	 * into account, the latter can cause the error to be hardly
-	 * reduced at the next tick. Check the error again if there's
-	 * room for another adjustment, thus further reducing the error
-	 * which otherwise had to be corrected at the next update.
-	 */
-	error = (error << 1) - *interval + *offset;
-	if (sign > 0 ? error > *interval : error < *interval)
-		adj++;
+	for (adj = 0; error > i; adj++)
+		error >>= 1;
 
 	*interval <<= adj;
 	*offset <<= adj;
-	return sign << adj;
+	return mult << adj;
 }
 
 /*
@@ -1068,11 +1069,19 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
 
 	error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
 	if (error > interval) {
-		adj = clocksource_bigadjust(1, error, &interval, &offset);
+		error >>= 2;
+		if (likely(error <= interval))
+			adj = 1;
+		else
+			adj = clocksource_bigadjust(error, &interval, &offset);
 	} else if (error < -interval) {
-		interval = -interval;
-		offset = -offset;
-		adj = clocksource_bigadjust(-1, error, &interval, &offset);
+		error >>= 2;
+		if (likely(error >= -interval)) {
+			adj = -1;
+			interval = -interval;
+			offset = -offset;
+		} else
+			adj = clocksource_bigadjust(error, &interval, &offset);
 	} else
 		return;
 
@@ -1129,7 +1138,7 @@ static void update_wall_time(void)
 	clocksource_adjust(clock, offset);
 
 	/* store full nanoseconds into xtime */
-	xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
+	xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
 	clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
 
 	/* check to see if there is a new clocksource to use */
-- cgit v1.2.3
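The heart of the patch above is the error-dependent look-ahead: each
factor-of-four growth in the accumulated error widens the correction
window by one power of two, so large errors are bled off gently instead
of being overcorrected when ticks are lost. Below is a minimal
standalone C sketch of just that damping loop; the constants, function
name, and main() harness are illustrative assumptions, not kernel code.

	#include <stdio.h>
	#include <stdint.h>
	#include <stdlib.h>

	/*
	 * For each factor-of-4 growth in the scaled error, widen the
	 * correction window by one power of two, mirroring the
	 * "for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2;"
	 * loop in clocksource_bigadjust().
	 */
	static unsigned int look_ahead_shift(int32_t scaled_error)
	{
		unsigned int look_ahead = 0;
		int32_t e = abs(scaled_error);

		while (e > 0) {
			e >>= 2;
			look_ahead++;
		}
		return look_ahead;
	}

	int main(void)
	{
		/* A small error is corrected quickly, a large one slowly. */
		printf("error=3     -> spread over 2^%u ticks\n", look_ahead_shift(3));
		printf("error=4096  -> spread over 2^%u ticks\n", look_ahead_shift(4096));
		printf("error=65536 -> spread over 2^%u ticks\n", look_ahead_shift(65536));
		return 0;
	}

A 16x larger error only adds two doublings to the window, which is why
a burst of lost ticks can no longer push the adjustment into oscillation.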
From a0009652af385a42f0e0604136f772ead406c78d Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Fri, 14 Jul 2006 00:24:06 -0700
Subject: [PATCH] del_timer_sync(): add cpu_relax()

Relax the CPU in the del_timer_sync() busywait loop.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index 2a87430a58d4..acfa557e685b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer)
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
 			return ret;
+		cpu_relax();
 	}
 }
 
-- cgit v1.2.3

From 3e143475c22036847f898d7e76ba337c1d7dbf6f Mon Sep 17 00:00:00 2001
From: john stultz
Date: Fri, 14 Jul 2006 00:24:17 -0700
Subject: [PATCH] improve timekeeping resume robustness

Resolve problems seen with APM suspend.  Due to resume initialization
ordering, it's possible we could get a timer interrupt before the
timekeeping resume() function is called.  This patch ensures we don't
do any timekeeping accounting before we're fully resumed.

(akpm: fixes the machine-freezes-on-APM-resume bug)

Signed-off-by: John Stultz
Cc: Roman Zippel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index acfa557e685b..05809c2e2fd6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -969,6 +969,7 @@ void __init timekeeping_init(void)
 }
 
+static int timekeeping_suspended;
 /*
  * timekeeping_resume - Resumes the generic timekeeping subsystem.
  * @dev:	unused
@@ -984,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev)
 	write_seqlock_irqsave(&xtime_lock, flags);
 	/* restart the last cycle value */
 	clock->cycle_last = clocksource_read(clock);
+	clock->error = 0;
+	timekeeping_suspended = 0;
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+	return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	timekeeping_suspended = 1;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	return 0;
 }
@@ -991,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev)
 /* sysfs resume/suspend bits for timekeeping */
 static struct sysdev_class timekeeping_sysclass = {
 	.resume		= timekeeping_resume,
+	.suspend	= timekeeping_suspend,
 	set_kset_name("timekeeping"),
 };
 
@@ -1101,13 +1115,16 @@ static void update_wall_time(void)
 {
 	cycle_t offset;
 
-	clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+	/* Make sure we're fully resumed: */
+	if (unlikely(timekeeping_suspended))
+		return;
 
 #ifdef CONFIG_GENERIC_TIME
 	offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
 #else
 	offset = clock->cycle_interval;
 #endif
+	clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
 
 	/* normally this loop will run just once, however in the
 	 * case of lost or late ticks, it will accumulate correctly.
-- cgit v1.2.3
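The fix is a guard flag: update_wall_time() refuses to account time
while timekeeping_suspended is set, and the resume path clears the flag
only after re-reading the cycle counter. A minimal single-threaded
userspace sketch of that pattern follows; the function and variable
names are invented for illustration and stand in for the kernel's
seqlock-protected state.

	#include <stdbool.h>
	#include <stdio.h>

	/* Accounting is skipped until resume() reinitializes state. */
	static bool timekeeping_suspended = true;
	static long last_cycle, total;

	static void resume(long now)
	{
		last_cycle = now;               /* restart the cycle base */
		timekeeping_suspended = false;
	}

	static void tick(long now)
	{
		if (timekeeping_suspended)      /* interrupt before resume: drop */
			return;
		total += now - last_cycle;
		last_cycle = now;
	}

	int main(void)
	{
		tick(100);   /* arrives before resume(): correctly ignored */
		resume(110);
		tick(115);
		printf("accounted=%ld\n", total); /* 5, not a bogus 115 */
		return 0;
	}

Without the flag, the early tick would account against a stale cycle
base, which is the machine-freezing overflow the patch avoids.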
From 8c78f3075dab4be279e283f901f00e33ce44890a Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman
Date: Sun, 30 Jul 2006 03:03:35 -0700
Subject: [PATCH] cpu hotplug: replace __devinit* with __cpuinit* for cpu notifications

A few of the callback functions and notifier blocks that are associated
with cpu notifications incorrectly have __devinit and __devinitdata.
They should be __cpuinit and __cpuinitdata instead.

It makes no functional difference but wastes text area when
CONFIG_HOTPLUG is enabled and CONFIG_HOTPLUG_CPU is not.

This patch fixes all those instances.

Signed-off-by: Chandra Seetharaman
Cc: Ashok Raj
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index 05809c2e2fd6..ee319fc69f40 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1688,7 +1688,7 @@ static void __devinit migrate_timers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __devinit timer_cpu_notify(struct notifier_block *self,
+static int __cpuinit timer_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -1708,7 +1708,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __devinitdata timers_nb = {
+static struct notifier_block __cpuinitdata timers_nb = {
 	.notifier_call	= timer_cpu_notify,
 };
 
-- cgit v1.2.3

From 6ea24f9ad18c65cc179593b5cc2a88cdadf8cc0c Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Sun, 30 Jul 2006 03:03:38 -0700
Subject: [PATCH] fix bad macro param in timer.c

We have

	#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK

and it's used via

	list = varray[i + 1]->vec + (INDEX(i + 1));

So, due to underparenthesisation, this INDEX(i+1) expands to a

	... (TVR_BITS + i + 1 * TVN_BITS)) ...

So this bugfix changes behaviour.  It worked before by sheer luck:

  "If i was anything but 0, it was broken.  But this was only used by
   s390 and arm.

   Since it was for the next interrupt, could that next interrupt be a
   problem (going into the second cascade)?  But it was probably seldom
   wrong.  That is, this would fail if the next interrupt was in the
   second cascade, and was wrapped.  Which may never have happened.
   Also if it did happen, it would have just missed the interrupt.

   If an interrupt was missed, and no one was there to miss it, was it
   really missed :-)"

Signed-off-by: Steven Rostedt
Cc: Oleg Nesterov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index ee319fc69f40..726caf9a0a69 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -408,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
 
 static inline void __run_timers(tvec_base_t *base)
 {
-- cgit v1.2.3
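The sheer luck described in the quoted message is easy to reproduce in
userspace: with i == 0 the two expansions happen to agree, and they
diverge for every other i. A small demo, with the macros shortened to
just the shift amount for illustration:

	#include <stdio.h>

	#define TVR_BITS 8
	#define TVN_BITS 6

	/* The old, underparenthesised form: N is substituted textually. */
	#define SHIFT_BAD(N)  (TVR_BITS + N * TVN_BITS)
	/* The fixed form from the patch. */
	#define SHIFT_GOOD(N) (TVR_BITS + (N) * TVN_BITS)

	int main(void)
	{
		for (int i = 0; i < 3; i++)
			printf("i=%d: bad=%d good=%d\n",
			       i, SHIFT_BAD(i + 1), SHIFT_GOOD(i + 1));
		/* i=0 agrees (14 == 14) by luck; i=1 gives 15 vs 20. */
		return 0;
	}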
From 51d8c5edd3b166fcc51aba84d78761d578400a7c Mon Sep 17 00:00:00 2001
From: Josh Triplett
Date: Sun, 30 Jul 2006 03:04:14 -0700
Subject: [PATCH] timer: Fix tvec_bases initializer

kernel/timer.c defines a (per-cpu) pointer to tvec_base_t, but
initializes it using { &a_tvec_base_t }, which sparse warns about;
change this to just &a_tvec_base_t.

Signed-off-by: Josh Triplett
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index 726caf9a0a69..b650f04888ed 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t;
 tvec_base_t boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
-- cgit v1.2.3
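For reference, the construct sparse objects to is legal C: a scalar
initializer may be brace-enclosed (C99 6.7.8p11), so the old form
compiled, but it reads like an aggregate initializer for a pointer. A
userspace analogue with simplified types (the names here are invented
for the demo):

	#include <stdio.h>

	static int boot_base;

	/* Legal C, but braces around a scalar initializer look like an
	 * aggregate and draw a sparse warning. */
	static int *base_braced = { &boot_base };

	/* The form the patch switches to. */
	static int *base_plain = &boot_base;

	int main(void)
	{
		printf("same target: %s\n",
		       base_braced == base_plain ? "yes" : "no");
		return 0;
	}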
From 6997a6faaa129a1c91775f7344c8d371a05178ea Mon Sep 17 00:00:00 2001
From: Kirill Korotaev
Date: Sun, 13 Aug 2006 23:24:23 -0700
Subject: [PATCH] sys_getppid oopses on debug kernel

The sys_getppid() optimization can access freed memory.  On kernels
with DEBUG_SLAB turned ON, this results in an Oops.  As Dave Hansen
noted, this optimization is also unsafe for memory hotplug.

So this patch always takes the lock to be safe.

[oleg@tv-sign.ru: simplifications]
Signed-off-by: Kirill Korotaev
Cc: 
Cc: Dave Hansen
Signed-off-by: Oleg Nesterov
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 kernel/timer.c | 41 +++++++----------------------------------
 1 file changed, 7 insertions(+), 34 deletions(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index b650f04888ed..1d7dd6267c2d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1324,46 +1324,19 @@ asmlinkage long sys_getpid(void)
 }
 
 /*
- * Accessing ->group_leader->real_parent is not SMP-safe, it could
- * change from under us. However, rather than getting any lock
- * we can use an optimistic algorithm: get the parent
- * pid, and go back and check that the parent is still
- * the same. If it has changed (which is extremely unlikely
- * indeed), we just try again..
- *
- * NOTE! This depends on the fact that even if we _do_
- * get an old value of "parent", we can happily dereference
- * the pointer (it was and remains a dereferencable kernel pointer
- * no matter what): we just can't necessarily trust the result
- * until we know that the parent pointer is valid.
- *
- * NOTE2: ->group_leader never changes from under us.
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
 */
 asmlinkage long sys_getppid(void)
 {
 	int pid;
-	struct task_struct *me = current;
-	struct task_struct *parent;
 
-	parent = me->group_leader->real_parent;
-	for (;;) {
-		pid = parent->tgid;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-{
-		struct task_struct *old = parent;
-
-		/*
-		 * Make sure we read the pid before re-reading the
-		 * parent pointer:
-		 */
-		smp_rmb();
-		parent = me->group_leader->real_parent;
-		if (old != parent)
-			continue;
-}
-#endif
-		break;
-	}
+	rcu_read_lock();
+	pid = rcu_dereference(current->real_parent)->tgid;
+	rcu_read_unlock();
+
 	return pid;
 }
 
-- cgit v1.2.3
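The retry loop removed above is a classic optimistic-read pattern: load
the parent pointer, read its field, then re-check the pointer. That is
only memory-safe while a stale pointer remains dereferencable, which is
exactly what an RCU grace period guarantees and what DEBUG_SLAB's
poisoning of freed objects exposes. A single-threaded userspace sketch
of the removed pattern, with C11 atomics standing in for the kernel's
barriers (all names illustrative):

	#include <stdatomic.h>
	#include <stdio.h>

	struct task { int tgid; };

	/* Shared pointer that a reparenting path may update concurrently. */
	static _Atomic(struct task *) real_parent;

	/* The pre-patch optimistic algorithm: dereference, then re-check.
	 * Safe only while freed tasks remain dereferencable - the guarantee
	 * the patch instead gets from rcu_read_lock()/rcu_dereference(). */
	static int getppid_optimistic(void)
	{
		for (;;) {
			struct task *parent = atomic_load(&real_parent);
			int pid = parent->tgid;

			atomic_thread_fence(memory_order_acquire); /* ~ smp_rmb() */
			if (atomic_load(&real_parent) == parent)
				return pid; /* pointer unchanged: pid is consistent */
		}
	}

	int main(void)
	{
		struct task init_task = { .tgid = 1 };

		atomic_store(&real_parent, &init_task);
		printf("ppid=%d\n", getppid_optimistic());
		return 0;
	}

The RCU version in the patch needs no retry at all: the read-side
critical section pins the grace period, so one dereference suffices.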