From 256e2fdf033f5c8b5093cd817d44cea3a11a4e6f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 6 Aug 2007 23:47:45 +0400 Subject: Fix Off-by-one in /sys/module/*/refcnt sysfs internals were changed to not pin module in question. Signed-off-by: Alexey Dobriyan Acked-by: Kay Sievers Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 33c04ad51175..db0ead0363e2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module *mod, char *buffer) { - /* sysfs holds a reference */ - return sprintf(buffer, "%u\n", module_refcount(mod)-1); + return sprintf(buffer, "%u\n", module_refcount(mod)); } static struct module_attribute refcnt = { -- cgit v1.2.3 From 2aa44d0567ed21b47b87d68819415d48194cb923 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: sched_clock_idle_[sleep|wakeup]_event() construct a more or less wall-clock time out of sched_clock(), by using ACPI-idle's existing knowledge about how much time we spent idling. This allows the rq clock to work around TSC-stops-in-C2, TSC-gets-corrupted-in-C3 type of problems. ( Besides the scheduler's statistics this also benefits blktrace and printk-timestamps as well. ) Furthermore, the precise before-C2/C3-sleep and after-C2/C3-wakeup callbacks allow the scheduler to get out the most of the period where the CPU has a reliable TSC. This results in slightly more precise task statistics. the ACPI bits were acked by Len. Signed-off-by: Ingo Molnar Acked-by: Len Brown --- kernel/sched.c | 41 ++++++++++++++++++++++++++++++++--------- kernel/sched_debug.c | 3 ++- 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 45e17b83b7f1..48e7586168ef 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -262,7 +262,8 @@ struct rq { s64 clock_max_delta; unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; + u64 idle_clock; + unsigned int clock_deep_idle_events; u64 tick_timestamp; atomic_t nr_iowait; @@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void) } /* - * CPU frequency is/was unstable - start new by setting prev_clock_raw: + * We are going deep-idle (irqs are disabled): */ -void sched_clock_unstable_event(void) +void sched_clock_idle_sleep_event(void) { - unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(smp_processor_id()); - rq = task_rq_lock(current, &flags); - rq->prev_clock_raw = sched_clock(); - rq->clock_unstable_events++; - task_rq_unlock(rq, &flags); + spin_lock(&rq->lock); + __update_rq_clock(rq); + spin_unlock(&rq->lock); + rq->clock_deep_idle_events++; +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); + + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + spin_lock(&rq->lock); + rq->prev_clock_raw = now; + rq->clock += delta_ns; + spin_unlock(&rq->lock); } +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); /* * resched_task - mark a task 'to be 
rescheduled now'. diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 87e524762b85..ab18f45f2ab2 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu) P(next_balance); P(curr->pid); P(clock); + P(idle_clock); P(prev_clock_raw); P(clock_warps); P(clock_overflows); - P(clock_unstable_events); + P(clock_deep_idle_events); P(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); -- cgit v1.2.3 From c57baf1e1e24b004b57d282267542baab802753c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: fix sysctl directory permissions There are two remaining gotchas: - The directories have impossible permissions (writeable). - The ctl_name for the kernel directory is inconsistent with everything else. It should be CTL_KERN. Signed-off-by: Eric W. Biederman Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 48e7586168ef..5fecbbba12ac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5257,15 +5257,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu) static struct ctl_table sd_ctl_dir[] = { { .procname = "sched_domain", - .mode = 0755, + .mode = 0555, }, {0,}, }; static struct ctl_table sd_ctl_root[] = { { + .ctl_name = CTL_KERN, .procname = "kernel", - .mode = 0755, + .mode = 0555, .child = sd_ctl_dir, }, {0,}, @@ -5341,7 +5342,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) for_each_domain(cpu, sd) { snprintf(buf, 32, "domain%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0755; + entry->mode = 0555; entry->child = sd_alloc_ctl_domain_table(sd); entry++; i++; @@ -5361,7 +5362,7 @@ static void init_sched_domain_sysctl(void) for (i = 0; i < cpu_num; i++, entry++) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0755; + entry->mode = 0555; entry->child = sd_alloc_ctl_cpu_table(i); } sd_sysctl_header = register_sysctl_table(sd_ctl_root); -- cgit v1.2.3 From f8700df7c419781efb34696de7e7f49717f8ede7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: fix broken SMT/MC optimizations On a four package system with HT - HT load balancing optimizations were broken. For example, if two tasks end up running on two logical threads of one of the packages, scheduler is not able to pull one of the tasks to a completely idle package. In this scenario, for nice-0 tasks, imbalance calculated by scheduler will be 512 and find_busiest_queue() will return 0 (as each cpu's load is 1024 > imbalance and has only one task running). Similarly MC scheduler optimizations also get fixed with this patch. [ mingo@elte.hu: restored fair balancing by increasing the fuzz and adding it back to the power decision, without the /2 factor. 
] Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5fecbbba12ac..d96030db8ff7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2517,7 +2517,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; -- cgit v1.2.3 From f549da848eca595abca14ebc5e1bf00fd72aa53d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: skip updating rq's next_balance under null SD Was playing with sched_smt_power_savings/sched_mc_power_savings and found out that while the scheduler domains are reconstructed when sysfs settings change, rebalance_domains() can get triggered with null domain on other cpus, which is setting next_balance to jiffies + 60*HZ. Resulting in no idle/busy balancing for 60 seconds. Fix this. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d96030db8ff7..a4b22d93e00d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3043,6 +3043,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3079,8 +3080,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) if (sd->flags & SD_SERIALIZE) spin_unlock(&balancing); out: - if (time_after(next_balance, sd->last_balance + interval)) + if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; + update_next_balance = 1; + } /* * Stop the load balance at this level. There is another @@ -3090,7 +3093,14 @@ out: if (!balance) break; } - rq->next_balance = next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; } /* -- cgit v1.2.3 From 505c0efd58031923ae01deac16d896607cafa70e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: tweak the sched_runtime_limit tunable Michael Gerdau reported reniced task CPU usage weirdnesses. Such symptoms can be caused by limit underruns so double the sched_runtime_limit. 
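As a rough worked example (HZ=250 and a dual-core box are assumptions of this illustration, not part of the patch): the base granularity at this point is still 2000000000/HZ = 8 msec, and sched_init_granularity() scales it by factor = 1 + ilog2(num_online_cpus()) = 2, giving 16 msec. The runtime limit therefore grows from 4 * 16 = 64 msec to 8 * 16 = 128 msec, i.e. twice the headroom before the limit starts clipping wait_runtime, which should make the reported underruns correspondingly less likely.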
Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a4b22d93e00d..96e9b82246d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4923,7 +4923,7 @@ static inline void sched_init_granularity(void) if (sysctl_sched_granularity > gran_limit) sysctl_sched_granularity = gran_limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; + sysctl_sched_runtime_limit = sysctl_sched_granularity * 8; sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; } -- cgit v1.2.3 From 7c6c16f354cde4a48bd305b2587fc78257bcb936 Mon Sep 17 00:00:00 2001 From: Bruce Ashfield Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: CONFIG_SCHED_GROUP_FAIR=y fixlet when I built with CONFIG_FAIR_GROUP_SCHED=y, I need the following change to make things right. [ From: mingo@elte.hu ] this config option is not upstream-configurable right now but lets fix this for completeness. Signed-off-by: Bruce Ashfield Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fedbb51bba96..b5270dc98bef 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1057,7 +1057,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) */ static void set_curr_task_fair(struct rq *rq) { - struct sched_entity *se = &rq->curr.se; + struct sched_entity *se = &rq->curr->se; for_each_sched_entity(se) set_next_entity(cfs_rq_of(se), se); -- cgit v1.2.3 From 71fd37146385c8255bfd370f33ca81fe8c81e5a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: remove HZ dependency from the granularity default remove HZ dependency from the granularity default. Use 10 msec for the base granularity, 1 msec for wakeup granularity and 25 msec for batch wakeup granularity. (These defaults are close to the values that the default HZ=250 setting got previously, and thus it's the most common setting.) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 96e9b82246d2..e95ff22ed174 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4923,7 +4923,7 @@ static inline void sched_init_granularity(void) if (sysctl_sched_granularity > gran_limit) sysctl_sched_granularity = gran_limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 8; + sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b5270dc98bef..6b0974c3fb67 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -19,7 +19,7 @@ /* * Preemption granularity: - * (default: 2 msec, units: nanoseconds) + * (default: 10 msec, units: nanoseconds) * * NOTE: this granularity value is not the same as the concept of * 'timeslice length' - timeslices in CFS will typically be somewhat @@ -31,18 +31,17 @@ * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; +unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; /* * SCHED_BATCH wake-up granularity. 
- * (default: 10 msec, units: nanoseconds) + * (default: 25 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = - 10000000000ULL/HZ; +unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; /* * SCHED_OTHER wake-up granularity. @@ -52,12 +51,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; +unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; unsigned int sysctl_sched_stat_granularity __read_mostly; /* - * Initialized in sched_init_granularity(): + * Initialized in sched_init_granularity() [to 5 times the base granularity]: */ unsigned int sysctl_sched_runtime_limit __read_mostly; -- cgit v1.2.3 From deac4ee65af4befb66b542e4a782e63da93b51a0 Mon Sep 17 00:00:00 2001 From: Sven-Thorsten Dietrich Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: simplify can_migrate_task() Remove trivial conditional branch in Linux scheduler's can_migrate_task() function. text data bss dec hex filename 34770 2998 24 37792 93a0 sched.o.before 34757 2998 24 37779 9393 sched.o.after Signed-off-by: Sven-Thorsten Dietrich Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e95ff22ed174..6798328a2e0e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2180,12 +2180,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (task_running(rq, p)) return 0; - /* - * Aggressive migration if too many balance attempts have failed: - */ - if (sd->nr_balance_failed > sd->cache_nice_tries) - return 1; - return 1; } -- cgit v1.2.3 From 98fbc798533339be802c6dcd48c2293c712e87db Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: optimize task_tick_rt() a bit Mitchell Erblich suggested a quality-of-implementation change to not requeue SCHED_RR tasks if there's only a single task on the runqueue, by checking for rq->nr_running == 1. provide a more efficient implementation of that, to check that particular RT priority-queue only. [ From: mingo@elte.hu ] Also first requeue the task then set need_resched - results in slightly better machine-instruction ordering. Also clean up the code a bit. 
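The check added below relies on the circular layout of the kernel's list nodes: a task that is the only entry on its priority queue has run_list.prev == run_list.next (both point at the queue head). A minimal stand-alone sketch of that property (plain userspace C with a hand-rolled node type, not kernel code):

    #include <stdio.h>

    struct node { struct node *prev, *next; };

    static void list_init(struct node *head)
    {
        head->prev = head->next = head;
    }

    static void list_add_tail(struct node *n, struct node *head)
    {
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
    }

    int main(void)
    {
        struct node queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        /* only task on the queue: prev and next both point at the head */
        printf("alone:     prev != next -> %d (no requeue needed)\n", a.prev != a.next);

        list_add_tail(&b, &queue);
        /* a second task joined: now a requeue actually rotates something */
        printf("not alone: prev != next -> %d (requeue + resched)\n", a.prev != a.next);
        return 0;
    }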
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index dcdcad632fd9..4b87476a02d0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -207,10 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) return; p->time_slice = static_prio_timeslice(p->static_prio); - set_tsk_need_resched(p); - /* put it at the end of the queue: */ - requeue_task_rt(rq, p); + /* + * Requeue to the end of queue if we are not the only element + * on the queue: + */ + if (p->run_list.prev != p->run_list.next) { + requeue_task_rt(rq, p); + set_tsk_need_resched(p); + } } static struct sched_class rt_sched_class __read_mostly = { -- cgit v1.2.3 From b2133c8b1e270b4a7c36f70e29be8738d09e850b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: tidy up and simplify the bonus balance make the bonus balance more consistent: do not hand out a bonus if there's too much in flight already, and only deduct as much from a runner as it has the capacity. This makes the bonus engine a zero-sum game (as intended). this also simplifies the code: text data bss dec hex filename 34770 2998 24 37792 93a0 sched.o.before 34749 2998 24 37771 938b sched.o.after and it also avoids overscheduling in sleep-happy workloads like hackbench.c. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6b0974c3fb67..c578370cd693 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -306,6 +306,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); delta = calc_delta_mine(delta, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); + delta = min(delta, (unsigned long)( + (long)sysctl_sched_runtime_limit - curr->wait_runtime)); cfs_rq->sleeper_bonus -= delta; delta_mine -= delta; } @@ -493,6 +495,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) unsigned long load = cfs_rq->load.weight, delta_fair; long prev_runtime; + /* + * Do not boost sleepers if there's too much bonus 'in flight' + * already: + */ + if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) + return; + if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) load = rq_of(cfs_rq)->cpu_load[2]; @@ -512,16 +521,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) prev_runtime = se->wait_runtime; __add_wait_runtime(cfs_rq, se, delta_fair); + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); delta_fair = se->wait_runtime - prev_runtime; /* * Track the amount of bonus we've given to sleepers: */ cfs_rq->sleeper_bonus += delta_fair; - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - cfs_rq->sleeper_bonus = sysctl_sched_runtime_limit; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -- cgit v1.2.3 From a6f2994042cc2db9e507dc702ed0b5e2cc5890fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: simplify bonus calculation #1 current code: delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); delta = calc_delta_mine(delta, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); drop 
the first min(), because we clip against sleeper_bonus in the 3rd line again. That gives: delta = calc_delta_mine(delta_exec, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c578370cd693..5b2d97fcd80c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -303,8 +303,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { - delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); - delta = calc_delta_mine(delta, curr->load.weight, lw); + delta = calc_delta_mine(delta_exec, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); -- cgit v1.2.3 From ea0aa3b23a193d1fc5c982286edecd071af67d94 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: simplify bonus calculation #2 current code: delta = calc_delta_mine(delta_exec, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); Notice that this calc_delta_mine() line is exactly delta_mine, which gives: delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5b2d97fcd80c..c078f1af721c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -303,8 +303,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { - delta = calc_delta_mine(delta_exec, curr->load.weight, lw); - delta = min((u64)delta, cfs_rq->sleeper_bonus); + delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); cfs_rq->sleeper_bonus -= delta; -- cgit v1.2.3 From 095e56c7036fe97bc3ebcd80ed6e121be0847656 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: fix startup penalty calculation fix task startup penalty miscalculation: sysctl_sched_granularity is unsigned int and wait_runtime is long so we first have to convert it to long before turning it negative ... Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c078f1af721c..4d6b7e2df2aa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1047,7 +1047,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -(sysctl_sched_granularity / 2); + p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); __enqueue_entity(cfs_rq, se); } -- cgit v1.2.3 From 1fc84aaae3bae9646dd4c7798b8c0ff934338909 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Aug 2007 18:41:52 +0200 Subject: sched: fix CONFIG_SCHED_DEBUG dependency of lockdep sysctls Make the lockdep sysctls not depend on CONFIG_SCHED_DEBUG. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9029690f4fae..ea90ef51085c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -283,6 +283,15 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_features", + .data = &sysctl_sched_features, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .ctl_name = CTL_UNNUMBERED, @@ -302,15 +311,6 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#endif - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #endif { .ctl_name = KERN_PANIC, -- cgit v1.2.3 From 218050855ece4e923106ab614ac65afa0f618df3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Aug 2007 18:41:53 +0200 Subject: sched: adaptive scheduler granularity Instead of specifying the preemption granularity, specify the wanted latency. By fixing the granlarity to a constany the wakeup latency it a function of the number of running tasks on the rq. Invert this relation. sysctl_sched_granularity becomes a minimum for the dynamic granularity computed from the new sysctl_sched_latency. Then use this latency to do more intelligent granularity decisions: if there are fewer tasks running then we can schedule coarser. This helps performance while still always keeping the latency target. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++---- kernel/sched_fair.c | 77 ++++++++++++++++++++++++++++++++++++++++++++--------- kernel/sysctl.c | 11 ++++++++ 3 files changed, 85 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6798328a2e0e..da26f46d50d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4911,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 100000000; + const unsigned long limit = 100000000; sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; + if (sysctl_sched_granularity > limit) + sysctl_sched_granularity = limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_runtime_limit = sysctl_sched_latency * 5; + sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; } #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4d6b7e2df2aa..0ba1e60f08d0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -15,23 +15,32 @@ * * Scaled math optimizations by Thomas Gleixner * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ /* - * Preemption granularity: - * (default: 10 msec, units: nanoseconds) + * Targeted preemption latency for CPU-bound tasks: + * (default: 20ms, units: nanoseconds) * - * NOTE: this granularity value is not the same as the 
concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length. + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches field) * * On SMP systems the value of this is multiplied by the log2 of the * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) + * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; +unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 2 msec, units: nanoseconds) + */ +unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. @@ -212,6 +221,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +/* + * Calculate the preemption granularity needed to schedule every + * runnable task once per sysctl_sched_latency amount of time. + * (down to a sensible low limit on granularity) + * + * For example, if there are 2 tasks running and latency is 10 msecs, + * we switch tasks every 5 msecs. If we have 3 tasks running, we have + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency + * for each task. We do finer and finer scheduling up to until we + * reach the minimum granularity value. + * + * To achieve this we use the following dynamic-granularity rule: + * + * gran = lat/nr - lat/nr/nr + * + * This comes out of the following equations: + * + * kA1 + gran = kB1 + * kB2 + gran = kA2 + * kA2 = kA1 + * kB2 = kB1 - d + d/nr + * lat = d * nr + * + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), + * '1' is start of time, '2' is end of time, 'd' is delay between + * 1 and 2 (during which task B was running), 'nr' is number of tasks + * running, 'lat' is the the period of each task. ('lat' is the + * sched_latency that we aim for.) 
+ */ +static long +sched_granularity(struct cfs_rq *cfs_rq) +{ + unsigned int gran = sysctl_sched_latency; + unsigned int nr = cfs_rq->nr_running; + + if (nr > 1) { + gran = gran/nr - gran/nr/nr; + gran = max(gran, sysctl_sched_granularity); + } + + return gran; +} + /* * We rescale the rescheduling granularity of tasks according to their * nice level, but only linearly, not exponentially: @@ -302,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { + if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); @@ -689,7 +741,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** @@ -1034,7 +1087,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * it will preempt the parent: */ p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; + niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: @@ -1047,7 +1100,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); + p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea90ef51085c..9e3d2960faf5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -231,6 +231,17 @@ static ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_wakeup_granularity_ns", -- cgit v1.2.3 From 172ac3dbb7d3e528ac53d08a34df88d1ac53c534 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Aug 2007 18:41:53 +0200 Subject: sched: cleanup, sched_granularity -> sched_min_granularity due to adaptive granularity scheduling the role of sched_granularity has changed to "minimum granularity", so rename the variable (and the tunable) accordingly. 
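For reference, a stand-alone sketch (plain C; the 20 msec / 2 msec defaults are taken from the patches above, and the function is a simplified copy of sched_granularity(), not the kernel code itself) of how the computed slice shrinks with nr_running until the renamed minimum becomes the effective value:

    #include <stdio.h>

    #define SYSCTL_SCHED_LATENCY          20000000U   /* 20 msec, in ns */
    #define SYSCTL_SCHED_MIN_GRANULARITY   2000000U   /*  2 msec, in ns */

    static unsigned int sched_granularity(unsigned int nr_running)
    {
        unsigned int gran = SYSCTL_SCHED_LATENCY;

        if (nr_running > 1) {
            /* gran = lat/nr - lat/nr/nr, clamped to the minimum */
            gran = gran / nr_running - gran / nr_running / nr_running;
            if (gran < SYSCTL_SCHED_MIN_GRANULARITY)
                gran = SYSCTL_SCHED_MIN_GRANULARITY;
        }
        return gran;
    }

    int main(void)
    {
        unsigned int nr;

        for (nr = 1; nr <= 10; nr++)
            printf("nr_running=%2u -> gran=%8u ns\n", nr, sched_granularity(nr));
        return 0;
    }

With these defaults the lat/nr - lat/nr/nr term drops below 2 msec once nr_running reaches 9, at which point sched_min_granularity acts purely as a floor.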
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 6 +++--- kernel/sched_fair.c | 4 ++-- kernel/sysctl.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index da26f46d50d7..a40ab657ad19 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4913,9 +4913,9 @@ static inline void sched_init_granularity(void) unsigned int factor = 1 + ilog2(num_online_cpus()); const unsigned long limit = 100000000; - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > limit) - sysctl_sched_granularity = limit; + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; sysctl_sched_latency *= factor; if (sysctl_sched_latency > limit) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0ba1e60f08d0..ee3771850aaf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -40,7 +40,7 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; +unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. @@ -258,7 +258,7 @@ sched_granularity(struct cfs_rq *cfs_rq) if (nr > 1) { gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_granularity); + gran = max(gran, sysctl_sched_min_granularity); } return gran; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9e3d2960faf5..6ace893c17c9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -222,8 +222,8 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_granularity_ns", - .data = &sysctl_sched_granularity, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, -- cgit v1.2.3 From 50c46637aa894f904e2fb39086a3d7732f68bd50 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Aug 2007 22:17:19 +0200 Subject: sched: s/sched_latency/sched_min_granularity runtime limit and wakeup granularity used to be a function of granularity and that was incorrect changed to sched_latency. Fix this to make wakeup granularity a function of min-granularity, and the runtime limit equal to latency. Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a40ab657ad19..9fe473a190de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4921,8 +4921,8 @@ static inline void sched_init_granularity(void) if (sysctl_sched_latency > limit) sysctl_sched_latency = limit; - sysctl_sched_runtime_limit = sysctl_sched_latency * 5; - sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; + sysctl_sched_runtime_limit = sysctl_sched_latency; + sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; } #ifdef CONFIG_SMP -- cgit v1.2.3 From d243769d3f83b318813a04a9592bb7cfedc6c280 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 27 Aug 2007 16:06:19 +0100 Subject: fix bogus hotplug cpu warning Fix bogus DEBUG_PREEMPT warning on x86_64, when cpu brought online after bootup: current_is_keventd is right to note its use of smp_processor_id is preempt-safe, but should use raw_smp_processor_id to avoid the warning. 
Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 58e5c152a6bb..e080d1d744cc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -635,7 +635,7 @@ int keventd_up(void) int current_is_keventd(void) { struct cpu_workqueue_struct *cwq; - int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ + int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ int ret = 0; BUG_ON(!keventd_wq); -- cgit v1.2.3 From 5f01d519e60a6ca1a7d9be9f2d73c5f521383992 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: fix sleeper bonus limit There is an Amarok song switch time increase (regression) under hefty load. What is happening is that sleeper_bonus is never consumed, and only rarely goes below runtime_limit, so for the most part, Amarok isn't getting any bonus at all. We're keeping sleeper_bonus right at runtime_limit (sched_latency == sched_runtime_limit == 40ms) forever, ie we don't consume if we're lower that that, and don't add if we're above it. One Amarok thread waking (or anybody else) will push us past the threshold, so the next thread waking gets nada, but will reap pain from the previous thread waking until we drop back to runtime_limit. It looks to me like under load, some random task gets a bonus, and everybody else pays, whether deserving or not. This diff fixed the regression for me at any load rate. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ee3771850aaf..9f53d49f3aab 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -354,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { + if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); -- cgit v1.2.3 From f6cf891c4d7128f9f91243fc0b9ce99e10fa1586 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: make the scheduler converge to the ideal latency de-HZ-ification of the granularity defaults unearthed a pre-existing property of CFS: while it correctly converges to the granularity goal, it does not prevent run-time fluctuations in the range of [-gran ... 0 ... +gran]. With the increase of the granularity due to the removal of HZ dependencies, this becomes visible in chew-max output (with 5 tasks running): out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 out: 27 . 27. 32 | flu: 0 . 0 | ran: 17 . 13 | per: 44 . 40 out: 27 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 36 . 40 out: 29 . 27. 32 | flu: 2 . 0 | ran: 17 . 13 | per: 46 . 40 out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 out: 29 . 27. 32 | flu: 0 . 0 | ran: 18 . 13 | per: 47 . 40 out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 average slice is the ideal 13 msecs and the period is picture-perfect 40 msecs. 
But the 'ran' field fluctuates around 13.33 msecs and there's no mechanism in CFS to keep that from happening: it's a perfectly valid solution that CFS finds. to fix this we add a granularity/preemption rule that knows about the "target latency", which makes tasks that run longer than the ideal latency run a bit less. The simplest approach is to simply decrease the preemption granularity when a task overruns its ideal latency. For this we have to track how much the task executed since its last preemption. ( this adds a new field to task_struct, but we can eliminate that overhead in 2.6.24 by putting all the scheduler timestamps into an anonymous union. ) with this change in place, chew-max output is fluctuation-less all around: out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40 this patch has no impact on any fastpath or on any globally observable scheduling property. (unless you have sharp enough eyes to see millisecond-level ruckles in glxgears smoothness :-) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched.c | 1 + kernel/sched_fair.c | 26 ++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9fe473a190de..b533d6db78aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1587,6 +1587,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; p->se.delta_exec = 0; p->se.delta_fair_run = 0; p->se.delta_fair_sleep = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9f53d49f3aab..721fe7744874 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -668,7 +668,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) /* * Preempt the current task with a newly woken task if needed: */ -static void +static int __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { @@ -679,8 +679,11 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: */ - if (__delta > niced_granularity(curr, granularity)) + if (__delta > niced_granularity(curr, granularity)) { resched_task(rq_of(cfs_rq)->curr); + return 1; + } + return 0; } static inline void @@ -725,6 +728,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { + unsigned long gran, ideal_runtime, delta_exec; struct sched_entity *next; /* @@ -741,8 +745,22 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, - sched_granularity(cfs_rq)); + gran = sched_granularity(cfs_rq); + ideal_runtime = niced_granularity(curr, + max(sysctl_sched_latency / cfs_rq->nr_running, + (unsigned long)sysctl_sched_min_granularity)); + /* + * If we executed more than what the latency constraint suggests, + * reduce the rescheduling granularity. 
This way the total latency + * of how much a task is not scheduled converges to + * sysctl_sched_latency: + */ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) + gran = 0; + + if (__check_preempt_curr_fair(cfs_rq, next, curr, gran)) + curr->prev_sum_exec_runtime = curr->sum_exec_runtime; } /************************************************** -- cgit v1.2.3 From 7109c4429af3640f79a638f177fc5d05b9807149 Mon Sep 17 00:00:00 2001 From: Ting Yang Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: call update_curr() in task_tick_fair() update the fair-clock before using it for the key value. [ mingo@elte.hu: small cleanups. ] Signed-off-by: Ting Yang Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 721fe7744874..9f06094e5275 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1094,10 +1094,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); sched_info_queued(p); + update_curr(cfs_rq); update_stats_enqueue(cfs_rq, se); /* * Child runs first: we let it run before the parent @@ -1105,7 +1106,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * it will preempt the parent: */ p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; + niced_granularity(curr, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: -- cgit v1.2.3 From b77d69db9f4ba03b2ed17e383c2d73ca89f5ab14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: fix wait_start_fair condition in update_stats_wait_end() Peter Zijlstra noticed the following bug in SCHED_FEAT_SKIP_INITIAL (which is disabled by default at the moment): it relies on se.wait_start_fair being 0 while update_stats_wait_end() did not recognize a 0 value, so instead of 'skipping' the initial interval we gave the new child a maximum boost of +runtime-limit ... (No impact on the default kernel, but nice to fix for completeness.) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9f06094e5275..0c718857176f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -489,6 +489,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long delta_fair; + if (unlikely(!se->wait_start_fair)) + return; + delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), (u64)(cfs_rq->fair_clock - se->wait_start_fair)); -- cgit v1.2.3 From 213c8af67f21c1dc0d50940b159d9521c95f3c89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: small schedstat fix small schedstat fix: the cfs_rq->wait_runtime 'sum of all runtimes' statistics counters missed newly forked tasks and thus had a constant negative skew. Fix this. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c718857176f..75f025da6f7c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1121,8 +1121,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) + if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) { p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); + } __enqueue_entity(cfs_rq, se); } -- cgit v1.2.3 From 9f508f8258e18e9333f18daf1f0860df48d49ed2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: clean up task_new_fair() cleanup: we have the 'se' and 'curr' entity-pointers already, no need to use p->se and current->se. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 75f025da6f7c..ce39282d9c0d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1108,21 +1108,21 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * until it reschedules once. We set up the key so that * it will preempt the parent: */ - p->se.fair_key = current->se.fair_key - + se->fair_key = curr->fair_key - niced_granularity(curr, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: */ if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - p->se.wait_start_fair = 0; + se->wait_start_fair = 0; /* * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) { - p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); + se->wait_runtime = -(sched_granularity(cfs_rq) / 2); schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -- cgit v1.2.3 From f2ab6d8889422c1f5354f014e8bef337b1d1bade Mon Sep 17 00:00:00 2001 From: Jonathan Lim Date: Thu, 30 Aug 2007 23:56:23 -0700 Subject: Assign task_struct.exit_code before taskstats_exit() taskstats.ac_exitcode is assigned to task_struct.exit_code in bacct_add_tsk() through the following kernel function calls: do_exit() taskstats_exit() fill_pid() bacct_add_tsk() The problem is that in do_exit(), task_struct.exit_code is set to 'code' only after taskstats_exit() has been called. So we need to move the assignment before taskstats_exit(). 
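A stand-alone illustration of the ordering problem (plain C; the structure and names are only loosely modelled on the kernel ones): whatever bacct_add_tsk() copies is the value of exit_code at the moment taskstats_exit() runs, so the assignment has to happen first.

    #include <stdio.h>

    struct task { long exit_code; };
    struct acct { long ac_exitcode; };

    static void bacct_add_tsk(struct acct *ac, const struct task *tsk)
    {
        ac->ac_exitcode = tsk->exit_code;   /* snapshots whatever is there now */
    }

    int main(void)
    {
        struct task tsk = { .exit_code = 0 };
        struct acct ac;
        long code = 42;

        /* old order: accounting runs before the field is written */
        bacct_add_tsk(&ac, &tsk);
        tsk.exit_code = code;
        printf("old order: ac_exitcode=%ld (stale)\n", ac.ac_exitcode);

        /* fixed order: write the field first, then account */
        tsk.exit_code = code;
        bacct_add_tsk(&ac, &tsk);
        printf("new order: ac_exitcode=%ld\n", ac.ac_exitcode);
        return 0;
    }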
Signed-off-by: Jonathan Lim Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9578c1ae19ca..06b24b3aa370 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -975,6 +975,7 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); @@ -996,7 +997,6 @@ fastcall NORET_TYPE void do_exit(long code) if (tsk->binfmt) module_put(tsk->binfmt->module); - tsk->exit_code = code; proc_exit_connector(tsk); exit_task_namespaces(tsk); exit_notify(tsk); -- cgit v1.2.3 From b07e35f94a7b6a059f889b904529ee907dc0634d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 30 Aug 2007 23:56:27 -0700 Subject: setpgid(child) fails if the child was forked by sub-thread Spotted by Marcin Kowalczyk . sys_setpgid(child) fails if the child was forked by sub-thread. Fix the "is it our child" check. The previous commit ee0acf90d320c29916ba8c5c1b2e908d81f5057d was not complete. (this patch asks for the new same_thread_group() helper, but mainline doesn't have it yet). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Tested-by: "Marcin 'Qrczak' Kowalczyk" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 449b81b98b3d..1b33b05d346b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1442,7 +1442,6 @@ asmlinkage long sys_times(struct tms __user * tbuf) * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. * LBT 04.03.94 */ - asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; @@ -1470,7 +1469,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->real_parent == group_leader) { + if (p->real_parent->tgid == group_leader->tgid) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; -- cgit v1.2.3 From f3de4be9d5f8551d7880a1f1f5231a30e0161b1f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 30 Aug 2007 23:56:29 -0700 Subject: PM: Fix dependencies of CONFIG_SUSPEND and CONFIG_HIBERNATION Dependencies of CONFIG_SUSPEND and CONFIG_HIBERNATION introduced by commit 296699de6bdc717189a331ab6bbe90e05c94db06 "Introduce CONFIG_SUSPEND for suspend-to-Ram and standby" are incorrect, as they don't cover the facts that (1) not all architectures support suspend and (2) SMP hibernation is only possible on X86 and PPC64 (if CONFIG_PPC64_SWSUSP is set). Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 4 ++-- kernel/power/Kconfig | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 181ae7086029..38033db8d8ec 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu) return err; } -#ifdef CONFIG_SUSPEND_SMP +#ifdef CONFIG_PM_SLEEP_SMP static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) @@ -334,4 +334,4 @@ void enable_nonboot_cpus(void) out: mutex_unlock(&cpu_add_remove_lock); } -#endif +#endif /* CONFIG_PM_SLEEP_SMP */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 412859f8d94a..c8580a1e6873 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -72,15 +72,10 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. -config SUSPEND_SMP_POSSIBLE - bool - depends on (X86 && !X86_VOYAGER) || (PPC64 && (PPC_PSERIES || PPC_PMAC)) - depends on SMP - default y - -config SUSPEND_SMP +config PM_SLEEP_SMP bool - depends on SUSPEND_SMP_POSSIBLE && PM_SLEEP + depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE + depends on PM_SLEEP select HOTPLUG_CPU default y @@ -89,20 +84,46 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION default y +config SUSPEND_UP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \ + || SUPERH || FRV + depends on !SMP + default y + +config SUSPEND_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) \ + || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM + depends on SMP + default y + config SUSPEND bool "Suspend to RAM and standby" depends on PM - depends on !SMP || SUSPEND_SMP_POSSIBLE + depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (i.e. the ACPI S3 state). +config HIBERNATION_UP_POSSIBLE + bool + depends on X86 || PPC64_SWSUSP || FRV || PPC32 + depends on !SMP + default y + +config HIBERNATION_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP + depends on SMP + default y + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP - depends on ((X86 || PPC64_SWSUSP || FRV || PPC32) && !SMP) || SUSPEND_SMP_POSSIBLE + depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the -- cgit v1.2.3 From 59845b1ffd9121e5ef474ea5f27405fd7a83c85b Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Thu, 30 Aug 2007 23:56:34 -0700 Subject: request_irq: fix DEBUG_SHIRQ handling Mariusz Kozlowski reported lockdep's warning: > ================================= > [ INFO: inconsistent lock state ] > 2.6.23-rc2-mm1 #7 > --------------------------------- > inconsistent {in-hardirq-W} -> {hardirq-on-W} usage. > ifconfig/5492 [HC0[0]:SC0[0]:HE1:SE1] takes: > (&tp->lock){+...}, at: [] rtl8139_interrupt+0x27/0x46b [8139too] > {in-hardirq-W} state was registered at: > [] __lock_acquire+0x949/0x11ac > [] lock_acquire+0x99/0xb2 > [] _spin_lock+0x35/0x42 > [] rtl8139_interrupt+0x27/0x46b [8139too] > [] handle_IRQ_event+0x28/0x59 > [] handle_level_irq+0xad/0x10b > [] do_IRQ+0x93/0xd0 > [] common_interrupt+0x2e/0x34 ... 
> other info that might help us debug this: > 1 lock held by ifconfig/5492: > #0: (rtnl_mutex){--..}, at: [] mutex_lock+0x1c/0x1f > > stack backtrace: ... > [] _spin_lock+0x35/0x42 > [] rtl8139_interrupt+0x27/0x46b [8139too] > [] free_irq+0x11b/0x146 > [] rtl8139_close+0x8a/0x14a [8139too] > [] dev_close+0x57/0x74 ... This shows that a driver's irq handler was running both in hard interrupt and process contexts with irqs enabled. The latter was done during free_irq() call and was possible only with CONFIG_DEBUG_SHIRQ enabled. This was fixed by another patch. But similar problem is possible with request_irq(): any locks taken from irq handler could be vulnerable - especially with soft interrupts. This patch fixes it by disabling local interrupts during handler's run. (It seems, disabling softirqs should be enough, but it needs more checking on possible races or other special cases). Reported-by: Mariusz Kozlowski Signed-off-by: Jarek Poplawski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 853aefbd184b..7230d914eaa2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -547,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler, * We do this before actually registering it, to make sure that * a 'real' IRQ doesn't run in parallel with our fake */ - if (irqflags & IRQF_DISABLED) { - unsigned long flags; + unsigned long flags; - local_irq_save(flags); - handler(irq, dev_id); - local_irq_restore(flags); - } else - handler(irq, dev_id); + local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); } #endif -- cgit v1.2.3 From 99db67bc04af0f2e8cb710ac92aaeb9af135a7c6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 30 Aug 2007 23:56:34 -0700 Subject: userns: don't leak root user Signed-off-by: Alexey Dobriyan Acked-by: Cedric Le Goater Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d055d987850c..85af9422ea6e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); + free_uid(ns->root_user); kfree(ns); } -- cgit v1.2.3 From 60187d2708caa870f0825d753df1612ea688eb9e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 30 Aug 2007 23:56:35 -0700 Subject: sigqueue_free: fix the race with collect_signal() Spotted by taoyue and Jeremy Katz . collect_signal: sigqueue_free: list_del_init(&first->list); if (!list_empty(&q->list)) { // not taken } q->flags &= ~SIGQUEUE_PREALLOC; __sigqueue_free(first); __sigqueue_free(q); Now, __sigqueue_free() is called twice on the same "struct sigqueue" with the obviously bad implications. In particular, this double free breaks the array_cache->avail logic, so the same sigqueue could be "allocated" twice, and the bug can manifest itself via the "impossible" BUG_ON(!SIGQUEUE_PREALLOC) in sigqueue_free/send_sigqueue. Hopefully this can explain these mysterious bug-reports, see http://marc.info/?t=118766926500003 http://marc.info/?t=118466273000005 Alexey Dobriyan reports this patch makes the difference for the testcase, but nobody has an access to the application which opened the problems originally. 
Also, this patch removes tasklist lock/unlock, ->siglock is enough. Signed-off-by: Oleg Nesterov Cc: taoyue Cc: Jeremy Katz Cc: Sukadev Bhattiprolu Cc: Alexey Dobriyan Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Roland McGrath Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ad63109e413c..3169bed0b4d0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1300,20 +1300,19 @@ struct sigqueue *sigqueue_alloc(void) void sigqueue_free(struct sigqueue *q) { unsigned long flags; + spinlock_t *lock = ¤t->sighand->siglock; + BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); /* * If the signal is still pending remove it from the - * pending queue. + * pending queue. We must hold ->siglock while testing + * q->list to serialize with collect_signal(). */ - if (unlikely(!list_empty(&q->list))) { - spinlock_t *lock = ¤t->sighand->siglock; - read_lock(&tasklist_lock); - spin_lock_irqsave(lock, flags); - if (!list_empty(&q->list)) - list_del_init(&q->list); - spin_unlock_irqrestore(lock, flags); - read_unlock(&tasklist_lock); - } + spin_lock_irqsave(lock, flags); + if (!list_empty(&q->list)) + list_del_init(&q->list); + spin_unlock_irqrestore(lock, flags); + q->flags &= ~SIGQUEUE_PREALLOC; __sigqueue_free(q); } -- cgit v1.2.3 From 7fd0d2dde929ead79901e389e70dbfb3c6c06986 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 5 Sep 2007 14:32:48 +0200 Subject: sched: fix MC/HT scheduler optimization, without breaking the FUZZ logic. First fix the check if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) with this if (*imbalance < busiest_load_per_task) As the current check is always false for nice 0 tasks (as SCHED_LOAD_SCALE_FUZZ is same as busiest_load_per_task for nice 0 tasks). With the above change, imbalance was getting reset to 0 in the corner case condition, making the FUZZ logic fail. Fix it by not corrupting the imbalance and change the imbalance, only when it finds that the HT/MC optimization is needed. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b533d6db78aa..c8759ec6d8a9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2512,7 +2512,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) { + if (*imbalance < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2564,10 +2564,8 @@ small_imbalance: pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ - if (pwr_move <= pwr_now) - goto out_balanced; - - *imbalance = busiest_load_per_task; + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; } return busiest; -- cgit v1.2.3 From a0dc72601d48b171b4870dfdd0824901a2b2b1a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: fix niced_granularity() shift fix niced_granularity(). This resulted in under-scheduling for CPU-bound negative nice level tasks (and this in turn caused higher than necessary latencies in nice-0 tasks). 
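A rough numeric sketch of the shift bug (assuming the 2.6.23-era constants WMULT_SHIFT = 32 and NICE_0_SHIFT = 10; both values are assumptions here, used only to make the arithmetic concrete). tmp carries a 2^WMULT_SHIFT fixed-point factor from the inv_weight multiply, but the result is expected in the 2^NICE_0_SHIFT load scale, so shifting by the full WMULT_SHIFT divides out an extra factor of 1024:

    /* Hypothetical negative-nice task: load weight 2048 (2 * NICE_0_LOAD),
     * granularity 2 ms. */
    unsigned long long gran = 2000000ULL;                 /* 2 ms in ns */
    unsigned long long inv_weight = (1ULL << 32) / 2048;  /* 2^21 */
    unsigned long long tmp = inv_weight * gran;

    long wrong = (long)(tmp >> 32);        /* 976 ns: 1024x too small       */
    long right = (long)(tmp >> (32 - 10)); /* 1000000 ns = gran * 1024/2048 */

The 1024x-too-small granularity is what produced the under-scheduling described above.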
Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ce39282d9c0d..810b52d994e0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -291,7 +291,7 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity) /* * It will always fit into 'long': */ - return (long) (tmp >> WMULT_SHIFT); + return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); } static inline void -- cgit v1.2.3 From a206c07213cf6372289f189c3774c4c3255a7ae1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: debug: fix cfs_rq->wait_runtime accounting the cfs_rq->wait_runtime debug/statistics counter was not maintained properly - fix this. this also removes some code: text data bss dec hex filename 13420 228 1204 14852 3a04 sched.o.before 13404 228 1204 14836 39f4 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 1 - kernel/sched_fair.c | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c8759ec6d8a9..97986f1f0be8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -858,7 +858,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) static void set_load_weight(struct task_struct *p) { - task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; p->se.wait_runtime = 0; if (task_has_rt_policy(p)) { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 810b52d994e0..bac2aff8273c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -194,6 +194,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static inline void @@ -205,6 +207,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + + schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -574,7 +578,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) prev_runtime = se->wait_runtime; __add_wait_runtime(cfs_rq, se, delta_fair); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); delta_fair = se->wait_runtime - prev_runtime; /* @@ -662,7 +665,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } - cfs_rq->wait_runtime -= se->wait_runtime; #endif } __dequeue_entity(cfs_rq, se); @@ -1121,10 +1123,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) { + if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) se->wait_runtime = -(sched_granularity(cfs_rq) / 2); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); - } __enqueue_entity(cfs_rq, se); } -- cgit v1.2.3 From 2491b2b89d4646e02ab51c90ab7012d124924ddc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: debug: fix sum_exec_runtime clearing when cleaning sched-stats also clear prev_sum_exec_runtime. 
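Stepping back to the wait_runtime accounting fix above: its shape is the usual one for keeping an aggregate in sync with a container, namely touching the counter only at the single enqueue/dequeue choke points. A toy model (not kernel code) of the invariant:

    /* Invariant: q->wait_runtime_sum always equals the sum of
     * se->wait_runtime over all queued entities, because the aggregate
     * is updated nowhere else. */
    struct entity { long wait_runtime; };
    struct queue_stats { long wait_runtime_sum; };

    static void enqueue(struct queue_stats *q, struct entity *se)
    {
            q->wait_runtime_sum += se->wait_runtime;  /* mirrors __enqueue_entity() */
    }

    static void dequeue(struct queue_stats *q, struct entity *se)
    {
            q->wait_runtime_sum -= se->wait_runtime;  /* mirrors __dequeue_entity() */
    }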
Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ab18f45f2ab2..c3ee38bd3426 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -283,4 +283,5 @@ void proc_sched_set_task(struct task_struct *p) p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; #endif p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; } -- cgit v1.2.3 From cf2ab4696ee42f895eed88c2b6e432fe03dda0db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: fix xtensa build warning rename RSR to SRR - 'RSR' is already defined on xtensa. found by Adrian Bunk. Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 97986f1f0be8..deeb1f8e0c30 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -668,7 +668,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) /* * Shift right and round: */ -#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, @@ -684,10 +684,10 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, * Check whether we'd overflow the 64-bit multiplication: */ if (unlikely(tmp > WMULT_CONST)) - tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, WMULT_SHIFT/2); else - tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT); + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -- cgit v1.2.3 From 7c92e54f6f9601cfa9d8894ee248abcf62ed9a1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: simplify __check_preempt_curr_fair() Preparatory patch for fix-ideal-runtime: simplify __check_preempt_curr_fair(): get rid of the integer return. text data bss dec hex filename 13404 228 1204 14836 39f4 sched.o.before 13393 228 1204 14825 39e9 sched.o.after functionality is unchanged. 
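As a side note on the SRR() (shift right and round) macro renamed in the xtensa fix above: it rounds to nearest instead of truncating toward zero, by adding half of the final divisor before shifting. A quick worked case with illustrative values:

    /* SRR(x, y) = (((x) + (1UL << ((y) - 1))) >> (y)) */
    unsigned long x = 1000;
    unsigned long truncated = x >> 4;                /* 62, since 1000/16 = 62.5 */
    unsigned long rounded   = (x + (1UL << 3)) >> 4; /* 63, the nearest integer  */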
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bac2aff8273c..f0dd4be1a3a4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -673,7 +673,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) /* * Preempt the current task with a newly woken task if needed: */ -static int +static void __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { @@ -686,9 +686,8 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, */ if (__delta > niced_granularity(curr, granularity)) { resched_task(rq_of(cfs_rq)->curr); - return 1; + curr->prev_sum_exec_runtime = curr->sum_exec_runtime; } - return 0; } static inline void @@ -764,8 +763,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (delta_exec > ideal_runtime) gran = 0; - if (__check_preempt_curr_fair(cfs_rq, next, curr, gran)) - curr->prev_sum_exec_runtime = curr->sum_exec_runtime; + __check_preempt_curr_fair(cfs_rq, next, curr, gran); } /************************************************** -- cgit v1.2.3 From 4a55b45036a677fac43fe81ddf7fdcd007aaaee7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: improve prev_sum_exec_runtime setting Second preparatory patch for fix-ideal runtime: Mark prev_sum_exec_runtime at the beginning of our run, the same spot that adds our wait period to wait_runtime. This seems a more natural location to do this, and it also reduces the code a bit: text data bss dec hex filename 13397 228 1204 14829 39ed sched.o.before 13391 228 1204 14823 39e7 sched.o.after Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f0dd4be1a3a4..2d01bbc2d04a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -684,10 +684,8 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: */ - if (__delta > niced_granularity(curr, granularity)) { + if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); - curr->prev_sum_exec_runtime = curr->sum_exec_runtime; - } } static inline void @@ -703,6 +701,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); set_cfs_rq_curr(cfs_rq, se); + se->prev_sum_exec_runtime = se->sum_exec_runtime; } static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) -- cgit v1.2.3 From 1169783085adb9ac969d21103a6885e8435f7ed3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: fix ideal_runtime calculations for reniced tasks fix ideal_runtime: - do not scale it using niced_granularity() it is against sum_exec_delta, so its wall-time, not fair-time. - move the whole check into __check_preempt_curr_fair() so that wakeup preemption can also benefit from the new logic. 
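As a rough worked example of the wall-time comparison this moves into __check_preempt_curr_fair() (using the defaults visible in the sched_fair.c hunks elsewhere in this log: 20 ms sysctl_sched_latency, 2 ms sysctl_sched_min_granularity):

    /* Illustration only, not kernel code. With 4 runnable tasks: */
    unsigned long latency  = 20000000UL;  /* 20 ms in ns */
    unsigned long min_gran =  2000000UL;  /*  2 ms in ns */
    unsigned int  nr_running = 4;

    unsigned long ideal_runtime = latency / nr_running;  /* 5 ms */
    if (ideal_runtime < min_gran)
            ideal_runtime = min_gran;

    /* delta_exec = sum_exec_runtime - prev_sum_exec_runtime is also in
     * wall-clock nanoseconds, so the two compare directly; only the
     * fair_key delta needs niced_granularity() scaling. */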
this also results in code size reduction: text data bss dec hex filename 13391 228 1204 14823 39e7 sched.o.before 13369 228 1204 14801 39d1 sched.o.after Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2d01bbc2d04a..892616bf2c77 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -678,11 +678,31 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { s64 __delta = curr->fair_key - se->fair_key; + unsigned long ideal_runtime, delta_exec; + + /* + * ideal_runtime is compared against sum_exec_runtime, which is + * walltime, hence do not scale. + */ + ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, + (unsigned long)sysctl_sched_min_granularity); + + /* + * If we executed more than what the latency constraint suggests, + * reduce the rescheduling granularity. This way the total latency + * of how much a task is not scheduled converges to + * sysctl_sched_latency: + */ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) + granularity = 0; /* * Take scheduling granularity into account - do not * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: + * + * scale granularity as key space is in fair_clock. */ if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); @@ -731,7 +751,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - unsigned long gran, ideal_runtime, delta_exec; struct sched_entity *next; /* @@ -748,21 +767,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - gran = sched_granularity(cfs_rq); - ideal_runtime = niced_granularity(curr, - max(sysctl_sched_latency / cfs_rq->nr_running, - (unsigned long)sysctl_sched_min_granularity)); - /* - * If we executed more than what the latency constraint suggests, - * reduce the rescheduling granularity. This way the total latency - * of how much a task is not scheduled converges to - * sysctl_sched_latency: - */ - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) - gran = 0; - - __check_preempt_curr_fair(cfs_rq, next, curr, gran); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** -- cgit v1.2.3 From 7d94143291e4e625e2bc3b1ebdc7143ee7a9a2f1 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 5 Sep 2007 03:05:56 -0700 Subject: Fix spurious syscall tracing after PTRACE_DETACH + PTRACE_ATTACH When PTRACE_SYSCALL was used and then PTRACE_DETACH is used, the TIF_SYSCALL_TRACE flag is left set on the formerly-traced task. This means that when a new tracer comes along and does PTRACE_ATTACH, it's possible he gets a syscall tracing stop even though he's never used PTRACE_SYSCALL. This happens if the task was in the middle of a system call when the second PTRACE_ATTACH was done. The symptom is an unexpected SIGTRAP when the tracer thinks that only SIGSTOP should have been provoked by his ptrace calls so far. A few machines already fixed this in ptrace_disable (i386, ia64, m68k). 
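To make the failure mode concrete, a rough user-space reproduction sketch (hypothetical and timing-dependent; the spurious stop only shows up on kernels without this fix):

    #include <stdio.h>
    #include <signal.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/ptrace.h>
    #include <sys/wait.h>

    int main(void)
    {
            int status;
            pid_t pid = fork();

            if (pid == 0)
                    for (;;)
                            getppid();              /* keep making syscalls */

            /* First tracer: one PTRACE_SYSCALL stop, then detach while the
             * child sits inside a syscall, leaving TIF_SYSCALL_TRACE set. */
            ptrace(PTRACE_ATTACH, pid, 0, 0);
            waitpid(pid, &status, 0);
            ptrace(PTRACE_SYSCALL, pid, 0, 0);
            waitpid(pid, &status, 0);               /* syscall stop */
            ptrace(PTRACE_DETACH, pid, 0, 0);

            /* Second tracer: never asks for syscall tracing, yet on a
             * buggy kernel the next resume can still stop with SIGTRAP. */
            ptrace(PTRACE_ATTACH, pid, 0, 0);
            waitpid(pid, &status, 0);               /* SIGSTOP from attach */
            ptrace(PTRACE_CONT, pid, 0, 0);
            waitpid(pid, &status, 0);
            if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
                    printf("spurious syscall-trace stop\n");

            kill(pid, SIGKILL);
            return 0;
    }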
But all other machines do not, and still have this bug. On x86_64, this constitutes a regression in IA32 compatibility support. Since all machines now use TIF_SYSCALL_TRACE for this, I put the clearing of TIF_SYSCALL_TRACE in the generic ptrace_detach code rather than adding it to every other machine's ptrace_disable. Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 82a558b655da..3eca7a55f2ee 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); /* protect against de_thread()->release_task() */ -- cgit v1.2.3 From 179c85ea53bef807621f335767e41e23f86f01df Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 11 Sep 2007 15:23:49 -0700 Subject: futex_compat: fix list traversal bugs The futex list traversal on the compat side appears to have a bug. Its loop termination condition compares: while (compat_ptr(uentry) != &head->list) But that can't be right, because "uentry" has the special "pi" indicator bit still potentially set at bit 0. This is cleared by fetch_robust_entry() into the "entry" return value. What this seems to mean is that the list won't terminate when list iteration gets back to the head, and we'll also process the list head like a normal entry, which could cause all kinds of problems. So we should check for equality with "entry". That pointer is of the non-compat type, so we have to do a little casting to keep the compiler and sparse happy. The same problem can in theory occur with the 'pending' variable, although that has not been reported by users so far. Based on the original patch from David Miller. Acked-by: Ingo Molnar Cc: Thomas Gleixner Cc: David Miller Signed-off-by: Arnd Bergmann Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index f7921360efad..7e52eb051f22 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -61,10 +61,10 @@ void compat_exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; - if (upending) + if (pending) handle_futex_death((void __user *)pending + futex_offset, curr, pip); - while (compat_ptr(uentry) != &head->list) { + while (entry != (struct robust_list __user *) &head->list) { /* * A pending lock might already be on the list, so * dont process it twice: -- cgit v1.2.3 From 3210f0ecdba6a81c3f8efe6f442d2e1f57db98f9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 11 Sep 2007 15:23:51 -0700 Subject: Restore call_usermodehelper_pipe() behaviour The semantics of call_usermodehelper_pipe() used to be that it would fork the helper, and wait for the kernel thread to be started. This was implemented by setting sub_info.wait to 0 (implicitly), and doing a wait_for_completion(). As part of the cleanup done in 0ab4dc92278a0f3816e486d6350c6652a72e06c8, call_usermodehelper_pipe() was changed to pass 1 as the value for wait to call_usermodehelper_exec(). This is equivalent to setting sub_info.wait to 1, which is a change from the previous behaviour.
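For context, a sketch of the call-site semantics (hypothetical helper path; assumes the four-argument signature kernel/kmod.c used at the time):

    struct file *pipe_w;
    char *argv[] = { "/sbin/helper", NULL };              /* hypothetical helper */
    char *envp[] = { "HOME=/", "PATH=/sbin:/bin", NULL };
    int ret;

    /* With the old semantics (and with UMH_WAIT_EXEC after this fix) this
     * returns once the helper has been started, and the caller then feeds
     * data into pipe_w, as do_coredump does. Waiting for the helper to
     * finish instead deadlocks: the helper blocks reading its stdin,
     * which nobody has written yet. */
    ret = call_usermodehelper_pipe("/sbin/helper", argv, envp, &pipe_w);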
Using 1 instead of 0 causes __call_usermodehelper() to start the kernel thread running wait_for_helper(), rather than directly calling ____call_usermodehelper(). The end result is that the calling kernel code blocks until the user mode helper finishes. As the helper is expecting input on stdin, and now no one is writing anything, everything locks up (observed in do_coredump). The fix is to change the 1 to UMH_WAIT_EXEC (aka 0), indicating that we want to wait for the kernel thread to be started, but not for the helper to finish. Signed-off-by: Michael Ellerman Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 9809cc1f33d6..c6a4f8aebeba 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -505,7 +505,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, if (ret < 0) goto out; - return call_usermodehelper_exec(sub_info, 1); + return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); out: call_usermodehelper_freeinfo(sub_info); -- cgit v1.2.3 From 298a5df45d497e66064fda22ef0abf13766d3333 Mon Sep 17 00:00:00 2001 From: Tony Breeds Date: Tue, 11 Sep 2007 15:24:03 -0700 Subject: Fix "no_sync_cmos_clock" logic inversion in kernel/time/ntp.c Seems to me that this timer will only get started on platforms that say they don't want it? Signed-off-by: Tony Breeds Cc: Paul Mackerras Cc: Gabriel Paubert Cc: Zachary Amsden Acked-by: Thomas Gleixner Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index cd91237dbfe3..de6a2d6b3ebb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -226,7 +226,7 @@ static void sync_cmos_clock(unsigned long dummy) static void notify_cmos_timer(void) { - if (no_sync_cmos_clock) + if (!no_sync_cmos_clock) mod_timer(&sync_cmos_timer, jiffies + 1); } -- cgit v1.2.3 From 3be9095063885d482b87d3875ea7f28e635882d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: timekeeping: access rtc outside of xtime lock Lockdep complains about the access of rtc in timekeeping_suspend inside the interrupt disabled region of the write locked xtime lock. Move the access outside. Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index acc417b5a9b7..f682091fa890 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -325,9 +325,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; + timekeeping_suspend_time = read_persistent_clock(); + write_seqlock_irqsave(&xtime_lock, flags); timekeeping_suspended = 1; - timekeeping_suspend_time = read_persistent_clock(); write_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); -- cgit v1.2.3 From 6a669ee8a790487b7ec1edda762d39615a78264b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: timekeeping: Prevent time going backwards on resume Timekeeping resume adjusts xtime by adding the slept time in seconds and resets the reference value of the clock source (clock->cycle_last). 
clock->cycle_last is used to calculate the delta between the last xtime update and the readout of the clock source in __get_nsec_offset(). xtime plus the offset is the current time. The resume code ignores the delta which had already elapsed between the last xtime update and the actual time of suspend. If the suspend time is short, then we can see time going backwards on resume. Suspend: offs_s = clock->read() - clock->cycle_last; now = xtime + offs_s; timekeeping_suspend_time = read_rtc(); Resume: sleep_time = read_rtc() - timekeeping_suspend_time; xtime.tv_sec += sleep_time; clock->cycle_last = clock->read(); offs_r = clock->read() - clock->cycle_last; now = xtime + offs_r; if sleep_time_seconds == 0 and offs_r < offs_s, then time goes backwards. Fix this by storing the offset from the last xtime update and adding it to xtime during resume, when we reset clock->cycle_last: sleep_time = read_rtc() - timekeeping_suspend_time; xtime.tv_sec += sleep_time; xtime += offs_s; /* Fixup xtime offset at suspend time */ clock->cycle_last = clock->read(); offs_r = clock->read() - clock->cycle_last; now = xtime + offs_r; Thanks to Marcelo for tracking this down on the OLPC and providing the necessary details to analyze the root cause. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Tosatti --- kernel/time/timekeeping.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f682091fa890..4ad79f6bdec6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -217,6 +217,7 @@ static void change_clocksource(void) } #else static inline void change_clocksource(void) { } +static inline s64 __get_nsec_offset(void) { return 0; } #endif /** @@ -280,6 +281,8 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; +/* xtime offset when we went into suspend */ +static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -305,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } + /* Make sure that we have the correct xtime reference */ + timespec_add_ns(&xtime, timekeeping_suspend_nsecs); /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); clock->error = 0; @@ -328,6 +333,8 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); + /* Get the current xtime offset */ + timekeeping_suspend_nsecs = __get_nsec_offset(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); -- cgit v1.2.3 From 07eec6af448d13a6a520d9c6f06f2e87f61b567a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: clockevents: Enforce oneshot broadcast when broadcast mask is set on resume The jinxed VAIO refuses to resume without hitting keys on the keyboard when this is not enforced. It is unclear why the cpu ends up in a lower C State without notifying the clock events layer, but enforcing the oneshot broadcast here is safe.
Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index db8e0f3d409b..947959fb2bb5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -382,12 +382,23 @@ static int tick_broadcast_set_event(ktime_t expires, int force) int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { + int cpu = smp_processor_id(); + + /* + * If the CPU is marked for broadcast, enforce oneshot + * broadcast mode. The jinxed VAIO does not resume otherwise. + * No idea why it ends up in a lower C State during resume + * without notifying the clock events layer. + */ + if (cpu_isset(cpu, tick_broadcast_mask)) + cpu_set(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); if(!cpus_empty(tick_broadcast_oneshot_mask)) tick_broadcast_set_event(ktime_get(), 1); - return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask); + return cpu_isset(cpu, tick_broadcast_oneshot_mask); } /* -- cgit v1.2.3 From 31d9b3938c0459e5e9755ce0a98ac1e24eeff972 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: clockevents: do not shutdown the oneshot broadcast device When a cpu goes offline it is removed from the broadcast masks. If the mask becomes empty the code shuts down the broadcast device. This is wrong, because the broadcast device needs to be ready for the online cpu going idle (into a c-state, which stops the local apic timer). Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 947959fb2bb5..aab881c86a1a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -560,20 +560,17 @@ void tick_broadcast_switch_to_oneshot(void) */ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { - struct clock_event_device *bc; unsigned long flags; unsigned int cpu = *cpup; spin_lock_irqsave(&tick_broadcast_lock, flags); - bc = tick_broadcast_device.evtdev; + /* + * Clear the broadcast mask flag for the dead cpu, but do not + * stop the broadcast device! + */ cpu_clear(cpu, tick_broadcast_oneshot_mask); - if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { - if (bc && cpus_empty(tick_broadcast_oneshot_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3 From 5e41d0d60a534d2a5dc9772600a58f44c8d12506 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: clockevents: prevent stale tick update on offline cpu Taking a cpu offline removes the cpu from the online mask before the CPU_DEAD notification is done. The clock events layer does the cleanup of the dead CPU from the CPU_DEAD notifier chain. tick_do_timer_cpu is used to avoid xtime lock contention by assigning the task of jiffies xtime updates to one CPU. If a CPU is taken offline, then this assignment becomes stale. This went unnoticed because most of the time the offline CPU went dead before the online CPU reached __cpu_die(), where the CPU_DEAD state is checked. In the case that the offline CPU did not reach the DEAD state before we reach __cpu_die(), the code in there goes to sleep for 100ms. Due to the stale time update assignment, the system is stuck forever. 
Take the assignment away when a cpu is no longer in the cpu_online_mask. We do this in the last call to tick_nohz_stop_sched_tick() when the offline CPU is on the way to the final play_dead() idle entry. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b416995b9757..8c3fef1db09c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = -1; + } + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; -- cgit v1.2.3 From efc63c4fb0f95865907472d1c6bc0cfea9ee156b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 18 Sep 2007 22:46:27 -0700 Subject: Fix UTS corruption during clone(CLONE_NEWUTS) struct utsname is copied from the master one without any exclusion. Here is sample output from one proggie doing sethostname("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); sethostname("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); and another clone(,, CLONE_NEWUTS, ...) uname() hostname = 'aaaaaaaaaaaaaaaaaaaaaaaaabbbbb' hostname = 'bbbaaaaaaaaaaaaaaaaaaaaaaaaaaa' hostname = 'aaaaaaaabbbbbbbbbbbbbbbbbbbbbb' hostname = 'aaaaaaaaaaaaaaaaaaaaaaaaaabbbb' hostname = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaabb' hostname = 'aaabbbbbbbbbbbbbbbbbbbbbbbbbbb' hostname = 'bbbbbbbbbbbbbbbbaaaaaaaaaaaaaa' The hostname is sometimes corrupted. Yes, even _the_ simplest namespace activity had a bug in it.
:-( Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/utsname.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/utsname.c b/kernel/utsname.c index 9d8180a0f0d8..816d7b24fa03 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) if (!ns) return ERR_PTR(-ENOMEM); + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + up_read(&uts_sem); kref_init(&ns->kref); return ns; } -- cgit v1.2.3 From d8a4821dca693867a7953104c1e3cc830eb9191f Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 18 Sep 2007 22:46:43 -0700 Subject: kernel/user.c: Use list_for_each_entry instead of list_for_each kernel/user.c: Convert list_for_each to list_for_each_entry in uid_hash_find() Signed-off-by: Matthias Kaehlcke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index e7d11cef6998..e080ba863ae3 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -67,13 +67,9 @@ static inline void uid_hash_remove(struct user_struct *up) static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) { - struct list_head *up; - - list_for_each(up, hashent) { - struct user_struct *user; - - user = list_entry(up, struct user_struct, uidhash_list); + struct user_struct *user; + list_for_each_entry(user, hashent, uidhash_list) { if(user->uid == uid) { atomic_inc(&user->__count); return user; -- cgit v1.2.3 From 735de2230f09741077a645a913de0a04b10208bf Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 18 Sep 2007 22:46:44 -0700 Subject: Convert uid hash to hlist Surprisingly, but (spotted by Alexey Dobriyan) the uid hash still uses list_heads, thus occupying twice as much place as it could. Convert it to hlist_heads. Signed-off-by: Pavel Emelyanov Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 15 ++++++++------- kernel/user_namespace.c | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index e080ba863ae3..add57c7e4c07 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,21 +55,22 @@ struct user_struct root_user = { /* * These routines must be called with the uidhash spinlock held! 
*/ -static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) +static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { - list_add(&up->uidhash_list, hashent); + hlist_add_head(&up->uidhash_node, hashent); } static inline void uid_hash_remove(struct user_struct *up) { - list_del(&up->uidhash_list); + hlist_del(&up->uidhash_node); } -static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) +static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) { struct user_struct *user; + struct hlist_node *h; - list_for_each_entry(user, hashent, uidhash_list) { + hlist_for_each_entry(user, h, hashent, uidhash_node) { if(user->uid == uid) { atomic_inc(&user->__count); return user; @@ -118,7 +119,7 @@ void free_uid(struct user_struct *up) struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { - struct list_head *hashent = uidhashentry(ns, uid); + struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; spin_lock_irq(&uidhash_lock); @@ -207,7 +208,7 @@ static int __init uid_cache_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(init_user_ns.uidhash_table + n); + INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 85af9422ea6e..e7ba1bf8457c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) kref_init(&ns->kref); for (n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(ns->uidhash_table + n); + INIT_HLIST_HEAD(ns->uidhash_table + n); /* Insert new root user. */ ns->root_user = alloc_uid(ns, 0); -- cgit v1.2.3 From 28f300d23674fa01ae747c66ce861d4ee6aebe8c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 18 Sep 2007 22:46:45 -0700 Subject: Fix user namespace exiting OOPs It turned out, that the user namespace is released during the do_exit() in exit_task_namespaces(), but the struct user_struct is released only during the put_task_struct(), i.e. MUCH later. On debug kernels with poisoned slabs this will cause the oops in uid_hash_remove() because the head of the chain, which resides inside the struct user_namespace, will be already freed and poisoned. Since the uid hash itself is required only when someone can search it, i.e. when the namespace is alive, we can safely unhash all the user_struct-s from it during the namespace exiting. The subsequent free_uid() will complete the user_struct destruction. For example simple program #include char stack[2 * 1024 * 1024]; int f(void *foo) { return 0; } int main(void) { clone(f, stack + 1 * 1024 * 1024, 0x10000000, 0); return 0; } run on kernel with CONFIG_USER_NS turned on will oops the kernel immediately. This was spotted during OpenVZ kernel testing. Signed-off-by: Pavel Emelyanov Signed-off-by: Alexey Dobriyan Acked-by: "Serge E. 
Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 26 +++++++++++++++++++++++++- kernel/user_namespace.c | 2 +- 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index add57c7e4c07..9ca2848fc356 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -62,7 +62,7 @@ static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *ha static inline void uid_hash_remove(struct user_struct *up) { - hlist_del(&up->uidhash_node); + hlist_del_init(&up->uidhash_node); } static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) @@ -199,6 +199,30 @@ void switch_uid(struct user_struct *new_user) suid_keys(current); } +void release_uids(struct user_namespace *ns) +{ + int i; + unsigned long flags; + struct hlist_head *head; + struct hlist_node *nd; + + spin_lock_irqsave(&uidhash_lock, flags); + /* + * collapse the chains so that the user_struct-s will + * be still alive, but not in hashes. subsequent free_uid() + * will free them. + */ + for (i = 0; i < UIDHASH_SZ; i++) { + head = ns->uidhash_table + i; + while (!hlist_empty(head)) { + nd = head->first; + hlist_del_init(nd); + } + } + spin_unlock_irqrestore(&uidhash_lock, flags); + + free_uid(ns->root_user); +} static int __init uid_cache_init(void) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e7ba1bf8457c..7af90fc4f0fd 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -81,7 +81,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); - free_uid(ns->root_user); + release_uids(ns); kfree(ns); } -- cgit v1.2.3 From 1799e35d5baab6e06168b46cc78b968e728ea3d1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Sep 2007 23:34:46 +0200 Subject: sched: add /proc/sys/kernel/sched_compat_yield add /proc/sys/kernel/sched_compat_yield to make sys_sched_yield() more agressive, by moving the yielding task to the last position in the rbtree. 
with sched_compat_yield=0: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2539 mingo 20 0 1576 252 204 R 50 0.0 0:02.03 loop_yield 2541 mingo 20 0 1576 244 196 R 50 0.0 0:02.05 loop with sched_compat_yield=1: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2584 mingo 20 0 1576 248 196 R 99 0.0 0:52.45 loop 2582 mingo 20 0 1576 256 204 R 0 0.0 0:00.00 loop_yield Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 5 +---- kernel/sched_fair.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++----- kernel/sysctl.c | 8 +++++++ 3 files changed, 66 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index deeb1f8e0c30..63e0971c8fbb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4550,10 +4550,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - if (unlikely(rq->nr_running == 1)) - schedstat_inc(rq, yld_act_empty); - else - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq, current); /* * Since we are going to call schedule() anyway, there's diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 892616bf2c77..c9fbe8e73a45 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -42,6 +42,14 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; */ unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +/* + * sys_sched_yield() compat mode + * + * This option switches the agressive yield implementation of the + * old scheduler back on. + */ +unsigned int __read_mostly sysctl_sched_compat_yield; + /* * SCHED_BATCH wake-up granularity. * (default: 25 msec, units: nanoseconds) @@ -897,19 +905,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) } /* - * sched_yield() support is very simple - we dequeue and enqueue + * sched_yield() support is very simple - we dequeue and enqueue. + * + * If compat_yield is turned on then we requeue to the end of the tree. */ static void yield_task_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *rightmost, *se = &p->se; + struct rb_node *parent; - __update_rq_clock(rq); /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Are we the only task in the tree? + */ + if (unlikely(cfs_rq->nr_running == 1)) + return; + + if (likely(!sysctl_sched_compat_yield)) { + __update_rq_clock(rq); + /* + * Dequeue and enqueue the task to update its + * position within the tree: + */ + dequeue_entity(cfs_rq, &p->se, 0); + enqueue_entity(cfs_rq, &p->se, 0); + + return; + } + /* + * Find the rightmost entry in the rbtree: */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + do { + parent = *link; + link = &parent->rb_right; + } while (*link); + + rightmost = rb_entry(parent, struct sched_entity, run_node); + /* + * Already in the rightmost position? 
+ */ + if (unlikely(rightmost == se)) + return; + + /* + * Minimally necessary key value to be last in the tree: + */ + se->fair_key = rightmost->fair_key + 1; + + if (cfs_rq->rb_leftmost == &se->run_node) + cfs_rq->rb_leftmost = rb_next(&se->run_node); + /* + * Relink the task to the rightmost position: + */ + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_link_node(&se->run_node, parent, link); + rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6ace893c17c9..53a456ebf6d5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,14 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_compat_yield", + .data = &sysctl_sched_compat_yield, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROVE_LOCKING { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3 From 9c95e7319ba98585ebb6d304eca2d56f401ed70c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 19 Sep 2007 23:34:46 +0200 Subject: sched: fix invalid sched_class use When using rt_mutex, a NULL pointer dereference occurs in enqueue_task_rt. Here is a scenario: 1) there are two threads; thread A is fair_sched_class and thread B is rt_sched_class. 2) Thread A is boosted up to rt_sched_class, because thread A holds an rt_mutex lock and thread B is waiting for the lock. 3) At this time, when thread A creates a new thread C, thread C gets rt_sched_class. 4) When doing wake_up_new_task() for thread C, the priority of thread C is out of the RT priority range, because the normal priority of thread A is not an RT priority. This corrupts data by overflowing the rt_prio_array. The new thread C should be fair_sched_class. The new thread must have a valid scheduler class before queuing. This patch sets the suitable scheduler class. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 63e0971c8fbb..6107a0cd6325 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1682,6 +1682,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->prio = effective_prio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || !current->se.on_rq) { -- cgit v1.2.3 From b8fceee17a310f189188599a8fa5e9beaff57eb0 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 20 Sep 2007 12:40:16 -0700 Subject: signalfd simplification This simplifies the signalfd code by no longer keeping it attached to the sighand for its whole lifetime. In this way, the signalfd remains attached to the sighand only during poll(2) (and select and epoll) and read(2). This also allows removing all the custom "tsk == current" checks in kernel/signal.c, since dequeue_signal() will only be called by "current". I think this is also what Ben was suggesting some time ago. The external effect of this is that a thread can extract only its own private signals and the group ones. I think this is an acceptable behaviour, in that those are the signals the thread would be able to fetch w/out signalfd.
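A minimal user-space sketch of the resulting semantics (shown with the modern glibc wrapper for readability; in 2007 this was a raw syscall). The calling thread blocks a signal and then reads it back through the signalfd, which after this change only ever dequeues the caller's own private signals plus the group-wide ones:

    #include <sys/signalfd.h>
    #include <signal.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
            sigset_t mask;
            struct signalfd_siginfo ssi;
            int sfd;

            sigemptyset(&mask);
            sigaddset(&mask, SIGINT);
            sigprocmask(SIG_BLOCK, &mask, NULL);  /* must block what we read */

            sfd = signalfd(-1, &mask, 0);
            if (sfd < 0) {
                    perror("signalfd");
                    return 1;
            }
            if (read(sfd, &ssi, sizeof(ssi)) == sizeof(ssi))
                    printf("dequeued signal %u\n", ssi.ssi_signo);
            return 0;
    }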
Signed-off-by: Davide Libenzi Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 --------- kernel/fork.c | 2 +- kernel/signal.c | 8 +++----- 3 files changed, 4 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 06b24b3aa370..993369ee94d1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); - /* - * Notify that this sighand has been detached. This must - * be called with the tsk->sighand lock held. Also, this - * access tsk->sighand internally, so it must be called - * before tsk->sighand is reset. - */ - signalfd_detach_locked(tsk); - posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) posix_cpu_timers_exit_group(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 7332e236d367..33f12f48684a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); - INIT_LIST_HEAD(&sighand->signalfd_list); + init_waitqueue_head(&sighand->signalfd_wqh); } void __init proc_caches_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 3169bed0b4d0..9fb91a32edda 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - if (likely(tsk == current)) - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); @@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) } } } - if (likely(tsk == current)) - recalc_sigpending(); + recalc_sigpending(); if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our @@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } - if (signr && likely(tsk == current) && + if (signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ /* -- cgit v1.2.3 From b7e113dc9d52c4a37d2da6fafe77959f3a28eccf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Sep 2007 22:29:06 +0000 Subject: clockevents: remove the suspend/resume workaround^Wthinko In a desparate attempt to fix the suspend/resume problem on Andrews VAIO I added a workaround which enforced the broadcast of the oneshot timer on resume. This was actually resolving the problem on the VAIO but was just a stupid workaround, which was not tackling the root cause: the assignement of lower idle C-States in the ACPI processor_idle code. The cpuidle patches, which utilize the dynamic tick feature and go faster into deeper C-states exposed the problem again. The correct solution is the previous patch, which prevents lower C-states across the suspend/resume. Remove the enforcement code, including the conditional broadcast timer arming, which helped to pamper over the real problem for quite a time. The oneshot broadcast flag for the cpu, which runs the resume code can never be set at the time when this code is executed. 
It only gets set, when the CPU is entering a lower idle C-State. Signed-off-by: Thomas Gleixner Tested-by: Andrew Morton Cc: Len Brown Cc: Venkatesh Pallipadi Cc: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/time/tick-broadcast.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index aab881c86a1a..0962e0577660 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -382,23 +382,8 @@ static int tick_broadcast_set_event(ktime_t expires, int force) int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - int cpu = smp_processor_id(); - - /* - * If the CPU is marked for broadcast, enforce oneshot - * broadcast mode. The jinxed VAIO does not resume otherwise. - * No idea why it ends up in a lower C State during resume - * without notifying the clock events layer. - */ - if (cpu_isset(cpu, tick_broadcast_mask)) - cpu_set(cpu, tick_broadcast_oneshot_mask); - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - - if(!cpus_empty(tick_broadcast_oneshot_mask)) - tick_broadcast_set_event(ktime_get(), 1); - - return cpu_isset(cpu, tick_broadcast_oneshot_mask); + return 0; } /* -- cgit v1.2.3 From 459685c75b82a0431da102365d507fdb72858b84 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Sep 2007 01:54:12 +0100 Subject: hibernation doesn't even build on frv - tons of helpers are missing Signed-off-by: Al Viro Acked-By: David Howells Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c8580a1e6873..14b0e10dc95c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -110,7 +110,7 @@ config SUSPEND config HIBERNATION_UP_POSSIBLE bool - depends on X86 || PPC64_SWSUSP || FRV || PPC32 + depends on X86 || PPC64_SWSUSP || PPC32 depends on !SMP default y -- cgit v1.2.3 From 4047727e5ae33f9b8d2b7766d1994ea6e5ec2991 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Mon, 1 Oct 2007 01:20:10 -0700 Subject: Fix SMP poweroff hangs We need to disable all CPUs other than the boot CPU (usually 0) before attempting to power-off modern SMP machines. This fixes the hang-on-poweroff issue on my MythTV SMP box, and also on Thomas Gleixner's new toybox. Signed-off-by: Mark Lord Acked-by: Thomas Gleixner Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 1b33b05d346b..8ae2e636eb1b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -878,6 +879,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); + disable_nonboot_cpus(); sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); -- cgit v1.2.3 From 9f96cb1e8bca179a92afa40dfc3c49990f1cfc71 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 1 Oct 2007 01:20:13 -0700 Subject: robust futex thread exit race Calling handle_futex_death in exit_robust_list for the different robust mutexes of a thread basically frees the mutex. Another thread might grab the lock immediately which updates the next pointer of the mutex. fetch_robust_entry over the next pointer might therefore branch into the robust mutex list of a different thread. 
This can cause two problems: 1) some mutexes held by the dead thread are not getting freed and 2) some mutexes held by a different thread are freed. The next pointer needs to be read before calling handle_futex_death. Signed-off-by: Martin Schwidefsky Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 26 ++++++++++++++++---------- kernel/futex_compat.c | 28 ++++++++++++++++++---------- 2 files changed, 34 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e8935b195e88..fcc94e7b4086 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1943,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; unsigned long futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -1965,11 +1966,13 @@ void exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; - if (pending) - handle_futex_death((void __user *)pending + futex_offset, - curr, pip); - + next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: @@ -1978,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr) if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&entry, &entry->next, &pi)) + if (rc) return; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -1991,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } + + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7e52eb051f22..2c2e2954b713 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - compat_uptr_t uentry, upending; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; + compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; - if (pending) - handle_futex_death((void __user *)pending + futex_offset, curr, pip); + next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: @@ -74,12 +80,11 @@ curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&uentry, &entry, - (compat_uptr_t __user *)&entry->next, &pi)) + if (rc) return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -88,6 +93,9 @@ cond_resched(); } + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } asmlinkage long -- cgit v1.2.3 From 30084fbd1caa4b2e1a336fcdef60b68129d1d8f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 Oct 2007 14:13:08 +0200 Subject: sched: fix profile=sleep fix sleep profiling - we lost this chunk in the CFS merge. Found-by: Mel Gorman Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c9fbe8e73a45..67c67a87146e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -639,6 +639,16 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->block_start = 0; se->sum_sleep_runtime += delta; + + /* + * Blocking time is in units of nanosecs, so shift by 20 to + * get a milliseconds-range estimation of the amount of + * time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), + delta >> 20); + } } #endif } -- cgit v1.2.3 From 74922be1485818ed368c4cf4f0b100f70bf01e08 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 7 Oct 2007 00:24:31 -0700 Subject: Fix timer_stats printout of events/sec When using /proc/timer_stats on ppc64 I noticed the events/sec field wasn't accurate. Sometimes the integer part was incorrect due to rounding (we weren't taking the fractional seconds into consideration). The fraction part is also wrong; we need to pad the printf statement and take the bottom three digits of 1000 times the value. Signed-off-by: Anton Blanchard Acked-by: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timer_stats.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 3c38fb5eae1b..c36bb7ed0301 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v) ms = 1; if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, - events / period.tv_sec, events * 1000 / ms); + seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", + events, events * 1000 / ms, + (events * 1000000 / ms) % 1000); else seq_printf(m, "%ld total events\n", events); -- cgit v1.2.3 From 291041e935e6d0513f2b7e4a300aa9f02ec1d925 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Oct 2007 00:24:36 -0700 Subject: fix bogus reporting of signals by audit Async signals should not be reported as sent by current in the audit log. As it is, we call audit_signal_info() too early in check_kill_permission(). Note that check_kill_permission() has that test already - it needs to know if it should apply current-based permission checks.
So the solution is to move the call of audit_signal_info() between those. Bogosity in question is easily reproduced - add a rule watching for e.g. kill(2) from specific process (so that audit_signal_info() would not short-circuit to nothing), say load_policy, watch the bogus OBJ_PID entry in audit logs claiming that write(2) on selinuxfs file issued by load_policy(8) had somehow managed to send a signal to syslogd... Signed-off-by: Al Viro Acked-by: Steve Grubb Acked-by: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9fb91a32edda..792952381092 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -531,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; - error = audit_signal_info(sig, t); /* Let audit system see the signal */ - if (error) - return error; - - error = -EPERM; - if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && ((sig != SIGCONT) || - (process_session(current) != process_session(t))) - && (current->euid ^ t->suid) && (current->euid ^ t->uid) - && (current->uid ^ t->suid) && (current->uid ^ t->uid) - && !capable(CAP_KILL)) + if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { + error = audit_signal_info(sig, t); /* Let audit system see the signal */ + if (error) + return error; + error = -EPERM; + if (((sig != SIGCONT) || + (process_session(current) != process_session(t))) + && (current->euid ^ t->suid) && (current->euid ^ t->uid) + && (current->uid ^ t->suid) && (current->uid ^ t->uid) + && !capable(CAP_KILL)) return error; + } return security_task_kill(t, info, sig, 0); } -- cgit v1.2.3