From 06fed33849c13af637c4d09e9ba27828fac9edd5 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 15 Feb 2006 15:17:38 -0800 Subject: [PATCH] cpuset: oops in exit on null cpuset fix Fix a latent bug in cpuset_exit() handling. If a task tried to allocate memory after calling cpuset_exit(), it oops'd in cpuset_update_task_memory_state() on a NULL cpuset pointer. So set the exiting tasks cpuset to the root cpuset instead of to NULL. A distro kernel hit this with an added kernel package that had just such a hook (allocating memory) in the exit code path. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba42b0a76961..12815d3f1a05 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1977,6 +1977,39 @@ void cpuset_fork(struct task_struct *child) * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't * mess with it, or task is a failed fork, never visible to attach_task. + * + * Hack: + * + * Set the exiting tasks cpuset to the root cpuset (top_cpuset). + * + * Don't leave a task unable to allocate memory, as that is an + * accident waiting to happen should someone add a callout in + * do_exit() after the cpuset_exit() call that might allocate. + * If a task tries to allocate memory with an invalid cpuset, + * it will oops in cpuset_update_task_memory_state(). + * + * We call cpuset_exit() while the task is still competent to + * handle notify_on_release(), then leave the task attached to + * the root cpuset (top_cpuset) for the remainder of its exit. + * + * To do this properly, we would increment the reference count on + * top_cpuset, and near the very end of the kernel/exit.c do_exit() + * code we would add a second cpuset function call, to drop that + * reference. This would just create an unnecessary hot spot on + * the top_cpuset reference count, to no avail. + * + * Normally, holding a reference to a cpuset without bumping its + * count is unsafe. The cpuset could go away, or someone could + * attach us to a different cpuset, decrementing the count on + * the first cpuset that we never incremented. But in this case, + * top_cpuset isn't going away, and either task has PF_EXITING set, + * which wards off any attach_task() attempts, or task is a failed + * fork, never visible to attach_task. + * + * Another way to do this would be to set the cpuset pointer + * to NULL here, and check in cpuset_update_task_memory_state() + * for a NULL pointer. This hack avoids that NULL check, for no + * cost (other than this way too long comment ;). **/ void cpuset_exit(struct task_struct *tsk) @@ -1984,7 +2017,7 @@ void cpuset_exit(struct task_struct *tsk) struct cpuset *cs; cs = tsk->cpuset; - tsk->cpuset = NULL; + tsk->cpuset = &top_cpuset; /* Hack - see comment above */ if (notify_on_release(cs)) { char *pathbuf = NULL; -- cgit v1.2.3 From c8adb494a6df6b2be8e50a8dafd5bab231df3505 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 15 Feb 2006 15:17:42 -0800 Subject: [PATCH] swsusp: nuke noisy message I get about 88 squillion of these when suspending an old ad450nx server. Cc: Pavel Roskin Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 41f66365f0d8..8d5a5986d621 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -91,10 +91,8 @@ static int save_highmem_zone(struct zone *zone) * corrected eventually when the cases giving rise to this * are better understood. */ - if (PageReserved(page)) { - printk("highmem reserved page?!\n"); + if (PageReserved(page)) continue; - } BUG_ON(PageNosave(page)); if (PageNosaveFree(page)) continue; -- cgit v1.2.3 From a62eaf151d9cb478d127cfbc2e93c498869785b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 16 Feb 2006 23:41:58 +0100 Subject: [PATCH] x86_64: Add boot option to disable randomized mappings and cleanup AMD SimNow!'s JIT doesn't like them at all in the guest. For distribution installation it's easiest if it's a boot time option. Also I moved the variable to a more appropiate place and make it independent from sysctl And marked __read_mostly which it is. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 71dd6f62efec..7654d55c47f5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,8 +126,6 @@ extern int sysctl_hz_timer; extern int acct_parm[]; #endif -int randomize_va_space = 1; - static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, -- cgit v1.2.3 From 726c14bf499e91e7ede4f1728830aba05c675061 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 17 Feb 2006 10:30:23 +1100 Subject: [PATCH] Provide an interface for getting the current tick length This provides an interface for arch code to find out how many nanoseconds are going to be added on to xtime by the next call to do_timer. The value returned is a fixed-point number in 52.12 format in nanoseconds. The reason for this format is that it gives the full precision that the timekeeping code is using internally. The motivation for this is to fix a problem that has arisen on 32-bit powerpc in that the value returned by do_gettimeofday drifts apart from xtime if NTP is being used. PowerPC is now using a lockless do_gettimeofday based on reading the timebase register and performing some simple arithmetic. (This method of getting the time is also exported to userspace via the VDSO.) However, the factor and offset it uses were calculated based on the nominal tick length and weren't being adjusted when NTP varied the tick length. Note that 64-bit powerpc has had the lockless do_gettimeofday for a long time now. It also had an extremely hairy routine that got called from the 32-bit compat routine for adjtimex, which adjusted the factor and offset according to what it thought the timekeeping code was going to do. Not only was this only called if a 32-bit task did adjtimex (i.e. not if a 64-bit task did adjtimex), it was also duplicating computations from kernel/timer.c and it wasn't clear that it was (still) correct. The simple solution is to ask the timekeeping code how long the current jiffy will be on each timer interrupt, after calling do_timer. If this jiffy will be a different length from the last one, we then need to compute new values for the factor and offset used in the lockless do_gettimeofday. In this way we can keep xtime and do_gettimeofday in sync, even when NTP is varying the tick length. Note that when adjtimex varies the tick length, it almost always introduces the variation from the next tick on. The only case I could see where adjtimex would vary the length of the current tick is when an old-style adjtime adjustment is being cancelled. (It's not clear to me why the adjustment has to be cancelled immediately rather than from the next tick on.) Thus I don't see any real need for a hook in adjtimex; the rare case of an old-style adjustment being cancelled can be fixed up at the next tick. Signed-off-by: Paul Mackerras Acked-by: john stultz Signed-off-by: Linus Torvalds --- kernel/timer.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index b9dad3994676..fe3a9a9f8328 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -717,12 +717,16 @@ static void second_overflow(void) #endif } -/* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +/* + * Returns how many microseconds we need to add to xtime this tick + * in doing an adjustment requested with adjtime. + */ +static long adjtime_adjustment(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; - if ((time_adjust_step = time_adjust) != 0 ) { + time_adjust_step = time_adjust; + if (time_adjust_step) { /* * We are doing an adjtime thing. Prepare time_adjust_step to * be within bounds. Note that a positive time_adjust means we @@ -733,10 +737,19 @@ static void update_wall_time_one_tick(void) */ time_adjust_step = min(time_adjust_step, (long)tickadj); time_adjust_step = max(time_adjust_step, (long)-tickadj); + } + return time_adjust_step; +} +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + long time_adjust_step, delta_nsec; + + time_adjust_step = adjtime_adjustment(); + if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - } delta_nsec = tick_nsec + time_adjust_step * 1000; /* * Advance the phase, once it gets to one microsecond, then @@ -758,6 +771,22 @@ static void update_wall_time_one_tick(void) } } +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 + * bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + long delta_nsec; + + delta_nsec = tick_nsec + adjtime_adjustment() * 1000; + return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; +} + /* * Using a loop looks inefficient, but "ticks" is * usually just one (we shouldn't be losing ticks, -- cgit v1.2.3 From 4bbf39c29bc3409d6454faf0dfa1b3b0aa2ac2af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Feb 2006 13:52:44 -0800 Subject: [PATCH] Introduce CONFIG_DEFAULT_MIGRATION_COST Heiko Carstens wrote: The boot sequence on s390 sometimes takes ages and we spend a very long time (up to one or two minutes) in calibrate_migration_costs. The time spent there differs from boot to boot. Also the calculated costs differ a lot. I've seen differences by up to a factor of 15 (yes, factor not percent). Also I doubt that making these measurements make much sense on a completely virtualized architecture where you cannot tell how much cpu time you will get anyway. So introduce the CONFIG_DEFAULT_MIGRATION_COST method for an architecture to set the scheduler migration costs. This turns off automatic detection of migration costs. Makes sense on virtual platforms, where migration costs are hard to measure accurately. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 66d957227de9..12d291bf3379 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5058,7 +5058,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, #define MAX_DOMAIN_DISTANCE 32 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = - { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = +/* + * Architectures may override the migration cost and thus avoid + * boot-time calibration. Unit is nanoseconds. Mostly useful for + * virtualized hardware: + */ +#ifdef CONFIG_DEFAULT_MIGRATION_COST + CONFIG_DEFAULT_MIGRATION_COST +#else + -1LL +#endif +}; /* * Allow override of migration cost - in units of microseconds. -- cgit v1.2.3 From a8534adb74e23374889b84b3d97eb18da542a1b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 17 Feb 2006 13:52:51 -0800 Subject: [PATCH] swsusp: fix breakage with swap on LVM Restore the compatibility with the older code and make it possible to suspend if the kernel command line doesn't contain the "resume=" argument Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 4e90905f0e87..2d9d08f72f76 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -153,13 +153,11 @@ static int swsusp_swap_check(void) /* This is called before saving image */ { int i; - if (!swsusp_resume_device) - return -ENODEV; spin_lock(&swap_lock); for (i = 0; i < MAX_SWAPFILES; i++) { if (!(swap_info[i].flags & SWP_WRITEOK)) continue; - if (is_resume_device(swap_info + i)) { + if (!swsusp_resume_device || is_resume_device(swap_info + i)) { spin_unlock(&swap_lock); root_swap = i; return 0; -- cgit v1.2.3