From 2a01bb3885c9145dbb7583d5aa5f5d5504f6f46f Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Wed, 11 Apr 2012 08:15:29 -0400 Subject: panic: Make panic_on_oops configurable Several distros set this by default by patching panic_on_oops. It seems to fit with the BOOTPARAM_{HARD,SOFT}_PANIC options though, so let's add a Kconfig entry and reduce some more upstream delta. Signed-off-by: Kyle McMartin Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120411121529.GH26688@redacted.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a175d79..b6215b7ce99d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,7 +27,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -int panic_on_oops; +int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; -- cgit v1.2.3 From 62be73eafaa045d3233337303fb140f7f8a61135 Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Tue, 15 May 2012 17:35:09 -0400 Subject: kdump: Execute kmsg_dump(KMSG_DUMP_PANIC) after smp_send_stop() This patch moves kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop(), to serialize the crash-logging process via smp_send_stop() and to thus retrieve a more stable crash image of all CPUs stopped. Signed-off-by: Seiji Aguchi Acked-by: Don Zickus Cc: dle-develop@lists.sourceforge.net Cc: Satoru Moriya Cc: Tony Luck Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/5C4C569E8A4B9B42A84A977CF070A35B2E4D7A5CE2@USINDEVS01.corp.hds.com Signed-off-by: Ingo Molnar --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b6215b7ce99d..d2a5f4ecc6dd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -108,8 +108,6 @@ void panic(const char *fmt, ...) */ crash_kexec(NULL); - kmsg_dump(KMSG_DUMP_PANIC); - /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic @@ -117,6 +115,8 @@ void panic(const char *fmt, ...) */ smp_send_stop(); + kmsg_dump(KMSG_DUMP_PANIC); + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); bust_spinlocks(0); -- cgit v1.2.3 From 23812b9d9e497580d38c62ebdc6f308733b0a32a Mon Sep 17 00:00:00 2001 From: Ning Jiang Date: Tue, 22 May 2012 00:19:20 +0800 Subject: genirq: Add IRQS_PENDING for nested and simple irq Every interrupt which is an active wakeup source needs the ability to abort suspend if there is a pending irq. Right now only edge and level irqs can do that. | +---------+ | INTC | +---------+ | GPIO_IRQ +------------+ | gpio-exp | +------------+ | | GPIO0_IRQ GPIO1_IRQ In the above diagram, gpio expander has irq number GPIO_IRQ, it is connected with two sub GPIO pins, GPIO0 and GPIO1. During suspend, we set IRQF_NO_SUSPEND for GPIO_IRQ so that gpio expander driver can handle the sub irq GPIO0_IRQ and GPIO1_IRQ, and these two irqs themselves can further be handled by simple or nested irq in some drivers(typically gpio and mfd driver). If they are used as wakeup sources during suspend, we want them to be able to abort suspend too. Setting IRQS_PENDING flag in handle_nested_irq() and handle_simple_irq() when the irq is disabled allows check_wakeup_irqs() to identify such irqs as source for aborting suspend. Signed-off-by: Ning Jiang Cc: rjw@sisk.pl Link: http://lkml.kernel.org/r/CAH3Oq6T905%2B3fkF43NAMMFvJvq7dsk_so6T2vQ8ZJrA5xiU3YA@mail.gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc275e4f629b..eebd6d5cfb44 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } handle_irq_event(desc); -- cgit v1.2.3 From 818b0f3bfb236ae66cac3ff38e86b9e47f24b7aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 30 Mar 2012 23:11:34 +0800 Subject: genirq: Introduce irq_do_set_affinity() to reduce duplicated code All invocations of chip->irq_set_affinity() are doing the same return value checks. Let them all use a common function. [ tglx: removed the silly likely while at it ] Signed-off-by: Jiang Liu Cc: Jiang Liu Cc: Keping Chen Link: http://lkml.kernel.org/r/1333120296-13563-3-git-send-email-jiang.liu@huawei.com Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 39 ++++++++++++++++++++++----------------- kernel/irq/migration.c | 13 ++----------- 3 files changed, 27 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d9..001fa5bab490 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); +extern int irq_do_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force); + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bb32326afe87..a1b903380bcf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -139,6 +139,25 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif +int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + struct irq_chip *chip = irq_data_get_irq_chip(data); + int ret; + + ret = chip->irq_set_affinity(data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(data->affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + ret = 0; + } + + return ret; +} + int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -149,14 +168,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = chip->irq_set_affinity(data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(data->affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - ret = 0; - } + ret = irq_do_set_affinity(data, mask, false); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -280,9 +292,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; - int ret, node = desc->irq_data.node; + int node = desc->irq_data.node; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -308,13 +319,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(mask, nodemask)) cpumask_and(mask, mask, nodemask); } - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } + irq_do_set_affinity(&desc->irq_data, mask, false); return 0; } #else diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b327..ca3f4aaff707 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - int ret = chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, desc->pending_mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } - } + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) + irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); cpumask_clear(desc->pending_mask); } -- cgit v1.2.3 From ee74d13229fb606353ff56f4927fa93b37e95bbe Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 20:40:55 +0530 Subject: smpboot, idle: Optimize calls to smp_processor_id() in idle_threads_init() While trying to initialize idle threads for all cpus, idle_threads_init() calls smp_processor_id() in a loop, which is unnecessary. The intent is to initialize idle threads for all non-boot cpus. So just use a variable to note the boot cpu and use it in the loop. Signed-off-by: Srivatsa S. Bhat Cc: suresh.b.siddha@intel.com Cc: venki@google.com Cc: nikunj@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20120524151055.2549.64309.stgit@srivatsabhat.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/smpboot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e1a797e028a3..0f2162f808a7 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -52,10 +52,12 @@ static inline void idle_init(unsigned int cpu) */ void __init idle_threads_init(void) { - unsigned int cpu; + unsigned int cpu, boot_cpu; + + boot_cpu = smp_processor_id(); for_each_possible_cpu(cpu) { - if (cpu != smp_processor_id()) + if (cpu != boot_cpu) idle_init(cpu); } } -- cgit v1.2.3 From 4a70d2d9909b43ed88043b98cabe2c7fbd563021 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 20:41:00 +0530 Subject: smpboot, idle: Fix comment mismatch over idle_threads_init() The comment over idle_threads_init() really talks about the functionality of idle_init(). Move that comment to idle_init(), and add a suitable comment over idle_threads_init(). Signed-off-by: Srivatsa S. Bhat Cc: suresh.b.siddha@intel.com Cc: venki@google.com Cc: nikunj@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20120524151100.2549.66501.stgit@srivatsabhat.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/smpboot.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 0f2162f808a7..98f60c5caa1b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void) per_cpu(idle_threads, smp_processor_id()) = current; } +/** + * idle_init - Initialize the idle thread for a cpu + * @cpu: The cpu for which the idle thread should be initialized + * + * Creates the thread if it does not exist. + */ static inline void idle_init(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); @@ -45,10 +51,7 @@ static inline void idle_init(unsigned int cpu) } /** - * idle_thread_init - Initialize the idle thread for a cpu - * @cpu: The cpu for which the idle thread should be initialized - * - * Creates the thread if it does not exist. + * idle_threads_init - Initialize idle threads for all cpus */ void __init idle_threads_init(void) { -- cgit v1.2.3 From 5307c9556bc17e3cd26d4e94fc3b2565921834de Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 8 May 2012 12:20:58 +0200 Subject: tick: Add tick skew boot option Let the user decide whether power consumption or jitter is the more important consideration for their machines. Quoting removal commit af5ab277ded04bd9bc6b048c5a2f0e7d70ef0867: "Historically, Linux has tried to make the regular timer tick on the various CPUs not happen at the same time, to avoid contention on xtime_lock. Nowadays, with the tickless kernel, this contention no longer happens since time keeping and updating are done differently. In addition, this skew is actually hurting power consumption in a measurable way on many-core systems." Problems: - Contrary to the above, systems do encounter contention on both xtime_lock and RCU structure locks when the tick is synchronized. - Moderate sized RT systems suffer intolerable jitter due to the tick being synchronized. - SGI reports the same for their large systems. - Fully utilized systems reap no power saving benefit from skew removal, but do suffer from resulting induced lock contention. - 0209f649 rcu: limit rcu_node leaf-level fanout This patch was born to combat lock contention which testing showed to have been _induced by_ skew removal. Skew the tick, contention disappeared virtually completely. Signed-off-by: Mike Galbraith Link: http://lkml.kernel.org/r/1336472458.21924.78.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..4eddbb5ea9c5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -814,6 +814,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) return HRTIMER_RESTART; } +static int sched_skew_tick; + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -831,6 +833,14 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + /* Offset the tick to avert xtime_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, @@ -910,3 +920,11 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); -- cgit v1.2.3 From e5400321a6f15ce0fe77c8455954f213ef7dcc54 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 9 May 2012 23:39:34 +0900 Subject: clockevents: Make clockevents_config() a global symbol Make clockevents_config() into a global symbol to allow it to be used by compiled-in clockevent drivers. This is needed by drivers that want to update the timer frequency after registration time. Signed-off-by: Magnus Damm Tested-by: Simon Horman Cc: arnd@arndb.de Cc: johnstul@us.ibm.com Cc: rjw@sisk.pl Cc: lethal@linux-sh.org Cc: gregkh@linuxfoundation.org Cc: olof@lixom.net Cc: Magnus Damm Link: http://lkml.kernel.org/r/20120509143934.27521.46553.sendpatchset@w520 Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -static void clockevents_config(struct clock_event_device *dev, - u32 freq) +void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; -- cgit v1.2.3 From 62cf20b32aee4ae889a2eb40fd41c0eab73de970 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 14:08:57 +0200 Subject: tick: Move skew_tick option into the HIGH_RES_TIMER section commit 5307c95 (tick: Add tick skew boot option) broke the !CONFIG_HIGH_RES_TIMERS build. Move the boot option parsing into the CONFIG_HIGH_RES_TIMERS section. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Mike Galbraith --- kernel/time/tick-sched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4eddbb5ea9c5..efd386667536 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -816,6 +816,14 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) static int sched_skew_tick; +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -920,11 +928,3 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } - -static int __init skew_tick(char *str) -{ - get_option(&str, &sched_skew_tick); - - return 0; -} -early_param("skew_tick", skew_tick); -- cgit v1.2.3 From fa980ca87d15bb8a1317853f257a505990f3ffde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 May 2012 08:24:39 -0700 Subject: cgroup: superblock can't be released with active dentries 48ddbe1946 "cgroup: make css->refcnt clearing on cgroup removal optional" allowed a css to linger after the associated cgroup is removed. As a css holds a reference on the cgroup's dentry, it means that cgroup dentries may linger for a while. cgroup_create() does grab an active reference on the superblock to prevent it from going away while there are !root cgroups; however, the reference is put from cgroup_diput() which is invoked on cgroup removal, so cgroup dentries which are removed but persisting due to lingering csses already have released their superblock active refs allowing superblock to be killed while those dentries are around. Given the right condition, this makes cgroup_kill_sb() call kill_litter_super() with dentries with non-zero d_count leading to BUG() in shrink_dcache_for_umount_subtree(). Fix it by adding cgroup_dops->d_release() operation and moving deactivate_super() to it. cgroup_diput() now marks dentry->d_fsdata with itself if superblock should be deactivated and cgroup_d_release() deactivates the superblock on dentry release. Signed-off-by: Tejun Heo Reported-by: Sasha Levin Tested-by: Sasha Levin LKML-Reference: Acked-by: Li Zefan --- kernel/cgroup.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad8eae5bb801..e887b55f1f29 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -896,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * Drop the active superblock reference that we took when we - * created the cgroup + * We want to drop the active superblock reference from the + * cgroup creation after all the dentry refs are gone - + * kill_sb gets mighty unhappy otherwise. Mark + * dentry->d_fsdata with cgroup_diput() to tell + * cgroup_d_release() to call deactivate_super(). */ - deactivate_super(cgrp->root->sb); + dentry->d_fsdata = cgroup_diput; /* * if we're getting rid of the cgroup, refcount should ensure @@ -925,6 +928,13 @@ static int cgroup_delete(const struct dentry *d) return 1; } +static void cgroup_d_release(struct dentry *dentry) +{ + /* did cgroup_diput() tell me to deactivate super? */ + if (dentry->d_fsdata == cgroup_diput) + deactivate_super(dentry->d_sb); +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1532,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, + .d_release = cgroup_d_release, }; struct inode *inode = -- cgit v1.2.3 From 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 17:15:29 +0200 Subject: sched/nohz: Fix rq->cpu_load calculations some more Follow up on commit 556061b00 ("sched/nohz: Fix rq->cpu_load[] calculations") since while that fixed the busy case it regressed the mostly idle case. Add a callback from the nohz exit to also age the rq->cpu_load[] array. This closes the hole where either there was no nohz load balance pass during the nohz, or there was a 'significant' amount of idle time between the last nohz balance and the nohz exit. So we'll update unconditionally from the tick to not insert any accidental 0 load periods while busy, and we try and catch up from nohz idle balance and nohz exit. Both these are still prone to missing a jiffy, but that has always been the case. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Cc: Venkatesh Pallipadi Link: http://lkml.kernel.org/n/tip-kt0trz0apodbf84ucjfdbr1a@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 +++++++++++++++++++++++++++++++++++++++--------- kernel/time/tick-sched.c | 1 + 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc38..75844a8f9aeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2517,25 +2517,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = jiffies; + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = this_rq->load.weight; unsigned long pending_updates; /* - * Bloody broken means of dealing with nohz, but better than nothing.. - * jiffies is updated by one cpu, another cpu can drift wrt the jiffy - * update and see 0 difference the one time and 2 the next, even though - * we ticked at roughtly the same rate. - * - * Hence we only use this from nohz_idle_balance() and skip this - * nonsense when called from the scheduler_tick() since that's - * guaranteed a stable rate. + * bail if there's load or we're actually up-to-date. */ if (load || curr_jiffies == this_rq->last_load_update_tick) return; @@ -2546,13 +2553,39 @@ void update_idle_cpu_load(struct rq *this_rq) __update_cpu_load(this_rq, load, pending_updates); } +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + /* * Called from scheduler_tick() */ static void update_cpu_load_active(struct rq *this_rq) { /* - * See the mess in update_idle_cpu_load(). + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, this_rq->load.weight, 1); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..0c927cd85345 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* -- cgit v1.2.3 From 2ea45800d8e1c3c51c45a233d6bd6289a297a386 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 May 2012 09:26:43 +0200 Subject: sched: Don't try allocating memory from offline nodes Allocators don't appreciate it when you try and allocate memory from offline nodes. Reported-and-tested-by: Tony Luck Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-epfc1io9whb7o22bcujf31vn@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75844a8f9aeb..55733616baaa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6436,7 +6436,7 @@ static void sched_init_numa(void) return; for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!mask) return; -- cgit v1.2.3 From 74a5ce20e6eeeb3751340b390e7ac1d1d07bbf55 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 May 2012 18:00:43 +0200 Subject: sched: Fix SD_OVERLAP SD_OVERLAP exists to allow overlapping groups, overlapping groups appear in NUMA topologies that aren't fully connected. The typical result of not fully connected NUMA is that each cpu (or rather node) will have different spans for a particular distance. However due to how sched domains are traversed -- only the first cpu in the mask goes one level up -- the next level only cares about the spans of the cpus that went up. Due to this two things were observed to be broken: - build_overlap_sched_groups() -- since its possible the cpu we're building the groups for exists in multiple (or all) groups, the selection criteria of the first group didn't ensure there was a cpu for which is was true that cpumask_first(span) == cpu. Thus load- balancing would terminate. - update_group_power() -- assumed that the cpu span of the first group of the domain was covered by all groups of the child domain. The above explains why this isn't true, so deal with it. Signed-off-by: Peter Zijlstra Cc: David Rientjes Link: http://lkml.kernel.org/r/1337788843.9783.14.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++-- kernel/sched/fair.c | 25 ++++++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55733616baaa..3a69374fb427 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6030,11 +6030,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + sg->sgp = *per_cpu_ptr(sdd->sgp, i); atomic_inc(&sg->sgp->ref); - if (cpumask_test_cpu(cpu, sg_span)) + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + cpumask_first(sg_span) == cpu) { + WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); groups = sg; + } if (!first) first = sg; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf96..f0380d4987b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3574,11 +3574,26 @@ void update_group_power(struct sched_domain *sd, int cpu) power = 0; - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ + + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. + */ + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + } sdg->sgp->power = power; } -- cgit v1.2.3 From b654f7de41b0e3903ee2b51d3b8db77fe52ce728 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 May 2012 14:04:28 +0200 Subject: sched: Make sure to not re-read variables after validation We could re-read rq->rt_avg after we validated it was smaller than total, invalidating the check and resulting in an unintended negative. Signed-off-by: Peter Zijlstra Cc: David Rientjes Link: http://lkml.kernel.org/r/1337688268.9698.29.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f0380d4987b3..2b449a762074 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available; + u64 total, available, age_stamp, avg; - total = sched_avg_period() + (rq->clock - rq->age_stamp); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = ACCESS_ONCE(rq->age_stamp); + avg = ACCESS_ONCE(rq->rt_avg); + + total = sched_avg_period() + (rq->clock - age_stamp); - if (unlikely(total < rq->rt_avg)) { + if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ available = 0; } else { - available = total - rq->rt_avg; + available = total - avg; } if (unlikely((s64)total < SCHED_POWER_SCALE)) -- cgit v1.2.3 From 29baa7478ba47d746e3625c91d3b2afbf46b4312 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Apr 2012 12:11:21 +0200 Subject: sched: Move nr_cpus_allowed out of 'struct sched_rt_entity' Since nr_cpus_allowed is used outside of sched/rt.c and wants to be used outside of there more, move it to a more natural site. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kr61f02y9brwzkh6x53pdptm@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 36 +++++++++++++++++++++--------------- 3 files changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a69374fb427..70cc36a6073f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5015,7 +5015,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b449a762074..b2a2d236f27b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515f..295da737b6fe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1276,10 +1282,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && + p->nr_cpus_allowed > 1 && rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Only update if the process changes its state from whether it * can migrate or not. */ - if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; rq = task_rq(p); -- cgit v1.2.3 From 454c79999f7eaedcdf4c15c449e43902980cbdf5 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Wed, 16 May 2012 21:34:23 -0700 Subject: sched/rt: Fix SCHED_RR across cgroups task_tick_rt() has an optimization to only reschedule SCHED_RR tasks if they were the only element on their rq. However, with cgroups a SCHED_RR task could be the only element on its per-cgroup rq but still be competing with other SCHED_RR tasks in its parent's cgroup. In this case, the SCHED_RR task in the child cgroup would never yield at the end of its timeslice. If the child cgroup rt_runtime_us was the same as the parent cgroup rt_runtime_us, the task in the parent cgroup would starve completely. Modify task_tick_rt() to check that the task is the only task on its rq, and that the each of the scheduling entities of its ancestors is also the only entity on its rq. Signed-off-by: Colin Cross Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1337229266-15798-1-git-send-email-ccross@android.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 295da737b6fe..2a4e8dffbd6b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1985,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { + struct sched_rt_entity *rt_se = &p->rt; + update_curr_rt(rq); watchdog(rq, p); @@ -2002,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = RR_TIMESLICE; /* - * Requeue to the end of queue if we are not the only element - * on the queue: + * Requeue to the end of queue if we (and all of our ancestors) are the + * only element on the queue */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } } } -- cgit v1.2.3 From 1292531f6f27af909e713671dd9cc3bcab8114b7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 25 May 2012 15:41:54 +0900 Subject: sched: Make sched_feat_names const The strings sched_feat_names are never changed. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBF29B2.9030904@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 70cc36a6073f..c1679a098fc7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -142,7 +142,7 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = { #include "features.h" NULL }; -- cgit v1.2.3 From 7997a456ef841bb78eb6f881d7cc2c17c2f9b35e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 25 May 2012 15:42:47 +0900 Subject: sched: Remove the last NULL entry from sched_feat_names No need to have the last NULL entry. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBF29E7.5020805@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1679a098fc7..94d598ac5e64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -144,7 +144,6 @@ const_debug unsigned int sysctl_sched_features = static const char * const sched_feat_names[] = { #include "features.h" - NULL }; #undef SCHED_FEAT -- cgit v1.2.3 From 6a4c96eef42f835734a82c6b512abf9881b7c55d Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Wed, 23 May 2012 14:44:11 +0530 Subject: sched: Remove NULL assignment of dattr_cur Remove explicit NULL assignment of static pointer dattr_cur from init_sched_domains(). Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120523091411.GG5005@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94d598ac5e64..c46958e26121 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6726,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); -- cgit v1.2.3 From cb7225feec627e91d598198996429e9ee6804f8d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:51:44 +0900 Subject: perf: Remove duplicate invocation on perf_event_for_each The @func callback was invoked twice for group leader when perf_event_for_each() called. It seems the commit 75f937f24bd9 ("perf_counter: Fix ctx->mutex vs counter ->mutex inversion") made the mistake during the change. Signed-off-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1338443506-25009-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b06cbbf6931..f85c0154b333 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3181,7 +3181,6 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); -- cgit v1.2.3 From fad0c66c4bb836d57a5f125ecd38bed653ca863a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 30 May 2012 10:54:57 -0700 Subject: timekeeping: Fix CLOCK_MONOTONIC inconsistency during leapsecond Commit 6b43ae8a61 (ntp: Fix leap-second hrtimer livelock) broke the leapsecond update of CLOCK_MONOTONIC. The missing leapsecond update to wall_to_monotonic causes discontinuities in CLOCK_MONOTONIC. Adjust wall_to_monotonic when NTP inserted a leapsecond. Reported-by: Richard Cochran Signed-off-by: John Stultz Tested-by: Richard Cochran Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1338400497-12420-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6e46cacf5969..6f46a00a1e8a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } /* Accumulate raw time */ @@ -1077,6 +1078,7 @@ static void update_wall_time(void) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } timekeeping_update(false); -- cgit v1.2.3 From 8feb8e896d77439146d2e2ab3d0ab55bb5baf5fc Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 29 May 2012 15:16:05 +0800 Subject: smp: Remove ipi_call_lock[_irq]()/ipi_call_unlock[_irq]() There is no user of those APIs anymore, just remove it. Signed-off-by: Yong Zhang Cc: ralf@linux-mips.org Cc: sshtylyov@mvista.com Cc: david.daney@cavium.com Cc: nikunj@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: axboe@kernel.dk Cc: Andrew Morton Link: http://lkml.kernel.org/r/1338275765-3217-11-git-send-email-yong.zhang0@gmail.com Acked-by: Srivatsa S. Bhat Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/smp.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index d0ae5b24875e..29dd40a9f2f4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); - -void ipi_call_lock(void) -{ - raw_spin_lock(&call_function.lock); -} - -void ipi_call_unlock(void) -{ - raw_spin_unlock(&call_function.lock); -} - -void ipi_call_lock_irq(void) -{ - raw_spin_lock_irq(&call_function.lock); -} - -void ipi_call_unlock_irq(void) -{ - raw_spin_unlock_irq(&call_function.lock); -} #endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ -- cgit v1.2.3 From ec44bc7acc3687ba6ae8154b4b5a845b70279237 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:57 +0000 Subject: timers: Create detach_if_pending() and use it Most callers of detach_timer() have the same pattern around them. Check whether the timer is pending and eventually updating base->next_timer. Create detach_if_pending() and replace the duplicated code. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.131246037@linutronix.de --- kernel/timer.c | 56 +++++++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6ec7e7e0db43..0f70deb20151 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -654,8 +654,7 @@ void init_timer_deferrable_key(struct timer_list *timer, } EXPORT_SYMBOL(init_timer_deferrable_key); -static inline void detach_timer(struct timer_list *timer, - int clear_pending) +static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct list_head *entry = &timer->entry; @@ -667,6 +666,19 @@ static inline void detach_timer(struct timer_list *timer, entry->prev = LIST_POISON2; } +static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, + bool clear_pending) +{ + if (!timer_pending(timer)) + return 0; + + detach_timer(timer, clear_pending); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; + return 1; +} + /* * We are using hashed locking: holding per_cpu(tvec_bases).lock * means that all timers which are tied to this base via timer->base are @@ -712,16 +724,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 0); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } else { - if (pending_only) - goto out_unlock; - } + ret = detach_if_pending(timer, base, false); + if (!ret && pending_only) + goto out_unlock; debug_activate(timer, expires); @@ -959,13 +964,7 @@ int del_timer(struct timer_list *timer) timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } + ret = detach_if_pending(timer, base, true); spin_unlock_irqrestore(&base->lock, flags); } @@ -990,19 +989,10 @@ int try_to_del_timer_sync(struct timer_list *timer) base = lock_timer_base(timer, &flags); - if (base->running_timer == timer) - goto out; - - timer_stats_timer_clear_start_info(timer); - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; + if (base->running_timer != timer) { + timer_stats_timer_clear_start_info(timer); + ret = detach_if_pending(timer, base, true); } -out: spin_unlock_irqrestore(&base->lock, flags); return ret; @@ -1178,7 +1168,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); base->running_timer = timer; - detach_timer(timer, 1); + detach_timer(timer, true); spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); @@ -1714,7 +1704,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea while (!list_empty(head)) { timer = list_first_entry(head, struct timer_list, entry); - detach_timer(timer, 0); + detach_timer(timer, false); timer_set_base(timer, new_base); if (time_before(timer->expires, new_base->next_timer) && !tbase_get_deferrable(timer->base)) -- cgit v1.2.3 From facbb4a7efbd658046bf615f03cd97a1504785d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:57 +0000 Subject: timers: Consolidate base->next_timer update Another bunch of mindlessly copied code. All callers of internal_add_timer() except the recascading code updates base->next_timer. Move this into internal_add_timer() and let the cascading code call __internal_add_timer(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.189946224@linutronix.de --- kernel/timer.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0f70deb20151..7207690b5353 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -330,7 +330,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; @@ -372,6 +373,17 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + /* + * Update base->next_timer if this is the earliest one. + */ + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; +} + #ifdef CONFIG_TIMER_STATS void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) { @@ -757,9 +769,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -925,9 +934,6 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); debug_activate(timer, timer->expires); - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -1079,7 +1085,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { BUG_ON(tbase_get_base(timer->base) != base); - internal_add_timer(base, timer); + /* No accounting, while moving them */ + __internal_add_timer(base, timer); } return index; @@ -1706,9 +1713,6 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, false); timer_set_base(timer, new_base); - if (time_before(timer->expires, new_base->next_timer) && - !tbase_get_deferrable(timer->base)) - new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } -- cgit v1.2.3 From 99d5f3aac674fe081ffddd2dbb8946ccbc14c410 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:58 +0000 Subject: timers: Add accounting of non deferrable timers The code in get_next_timer_interrupt() is suboptimal as it has to run through the cascade to find the next expiring timer. On a completely idle core we should only do that when there is an active timer enqueued and base->next_timer does not give us a fast answer. Add accounting of the active timers to the now consolidated attach/detach code. I deliberately avoided sanity checks because the code is fully symetric and any fiddling with timers w/o using the API functions will lead to cute explosions anyway. ulong is big enough even on 32bit and if we really run into the situation to have more than 1<<32 timers enqueued there, then we are definitely not in a state to go idle and run through that code. This allows us to fix another shortcoming of get_next_timer_interrupt(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.236377028@linutronix.de --- kernel/timer.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 7207690b5353..7fada698bd1a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -77,6 +77,7 @@ struct tvec_base { struct timer_list *running_timer; unsigned long timer_jiffies; unsigned long next_timer; + unsigned long active_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -377,11 +378,13 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { __internal_add_timer(base, timer); /* - * Update base->next_timer if this is the earliest one. + * Update base->active_timers and base->next_timer */ - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; + if (!tbase_get_deferrable(timer->base)) { + if (time_before(timer->expires, base->next_timer)) + base->next_timer = timer->expires; + base->active_timers++; + } } #ifdef CONFIG_TIMER_STATS @@ -678,6 +681,14 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) entry->prev = LIST_POISON2; } +static inline void +detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +{ + detach_timer(timer, true); + if (!tbase_get_deferrable(timer->base)) + timer->base->active_timers--; +} + static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, bool clear_pending) { @@ -685,9 +696,11 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, return 0; detach_timer(timer, clear_pending); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; + if (!tbase_get_deferrable(timer->base)) { + timer->base->active_timers--; + if (timer->expires == base->next_timer) + base->next_timer = base->timer_jiffies; + } return 1; } @@ -1175,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); base->running_timer = timer; - detach_timer(timer, true); + detach_expired_timer(timer, base); spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); @@ -1701,6 +1714,7 @@ static int __cpuinit init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; + base->active_timers = 0; return 0; } @@ -1711,6 +1725,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea while (!list_empty(head)) { timer = list_first_entry(head, struct timer_list, entry); + /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); timer_set_base(timer, new_base); internal_add_timer(new_base, timer); -- cgit v1.2.3 From e40468a54882ef7411fb178dbf2e465ec2349af7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:59 +0000 Subject: timers: Improve get_next_timer_interrupt() Gilad reported at http://lkml.kernel.org/r/1336056962-10465-2-git-send-email-gilad@benyossef.com "Current timer code fails to correctly return a value meaning that there is no future timer event, with the result that the timer keeps getting re-armed in HZ one shot mode even when we could turn it off, generating unneeded interrupts. What is happening is that when __next_timer_interrupt() wishes to return a value that signifies "there is no future timer event", it returns (base->timer_jiffies + NEXT_TIMER_MAX_DELTA). However, the code in tick_nohz_stop_sched_tick(), which called __next_timer_interrupt() via get_next_timer_interrupt(), compares the return value to (last_jiffies + NEXT_TIMER_MAX_DELTA) to see if the timer needs to be re-armed. base->timer_jiffies != last_jiffies and so tick_nohz_stop_sched_tick() interperts the return value as indication that there is a distant future event 12 days from now and programs the timer to fire next after KTIME_MAX nsecs instead of avoiding to arm it. This ends up causing a needless interrupt once every KTIME_MAX nsecs." Fix this by using the new active timer accounting. This avoids scans when no active timer is enqueued completely, so we don't have to rely on base->timer_next and base->timer_jiffies anymore. Reported-by: Gilad Ben-Yossef Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.317535385@linutronix.de --- kernel/timer.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 7fada698bd1a..a61c09374eba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1326,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, unsigned long get_next_timer_interrupt(unsigned long now) { struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires; + unsigned long expires = now + NEXT_TIMER_MAX_DELTA; /* * Pretend that there is no timer pending if the cpu is offline. * Possible pending timers will be migrated later to an active cpu. */ if (cpu_is_offline(smp_processor_id())) - return now + NEXT_TIMER_MAX_DELTA; + return expires; + spin_lock(&base->lock); - if (time_before_eq(base->next_timer, base->timer_jiffies)) - base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + if (base->active_timers) { + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; + } spin_unlock(&base->lock); if (time_before_eq(expires, now)) -- cgit v1.2.3 From 10717dcde10d09f9fcee53a12a4236af1a82b484 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 6 Jun 2012 14:52:51 +0800 Subject: sched/numa: Load balance between remote nodes Commit cb83b629b ("sched/numa: Rewrite the CONFIG_NUMA sched domain support") removed the NODE sched domain and started checking if the node distance in SLIT table is farther than REMOTE_DISTANCE, if so, it will lose the load balance chance at exec/fork/wake_affine points. But actually, even the node distance is farther than REMOTE_DISTANCE. Modern CPUs also has QPI like connections, which ensures that memory access is not too slow between nodes. So the above change in behavior on NUMA machine causes a performance regression on various benchmarks: hackbench, tbench, netperf, oltp, etc. This patch will recover the scheduler behavior to old mode on all my Intel platforms: NHM EP/EX, WSM EP, SNB EP/EP4S, and thus fixes the perfromance regressions. (all of them just have 2 kinds distance, 10, 21) Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338965571-9812-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c46958e26121..6546083af3e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6321,7 +6321,7 @@ static int sched_domains_curr_level; static inline int sd_local_flags(int level) { - if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) return 0; return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -- cgit v1.2.3 From 7f1b43936f0ecad14770634c021cf4a929aec74d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 21:19:46 +0200 Subject: sched/rt: Fix lockdep annotation within find_lock_lowest_rq() Roland Dreier reported spurious, hard to trigger lockdep warnings within the scheduler - without any real lockup. This bit gives us the right clue: > [89945.640512] [] double_lock_balance+0x5a/0x90 > [89945.640568] [] push_rt_task+0xc6/0x290 if you look at that code you'll find the double_lock_balance() in question is the one in find_lock_lowest_rq() [yay for inlining]. Now find_lock_lowest_rq() has a bug.. it fails to use double_unlock_balance() in one exit path, if this results in a retry in push_rt_task() we'll call double_lock_balance() again, at which point we'll run into said lockdep confusion. Reported-by: Roland Dreier Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1337282386.4281.77.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2a4e8dffbd6b..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1562,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->on_rq)) { - raw_spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; break; } -- cgit v1.2.3 From c1174876874dcf8986806e4dad3d7d07af20b439 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 14:47:33 +0200 Subject: sched: Fix domain iteration Weird topologies can lead to asymmetric domain setups. This needs further consideration since these setups are typically non-minimal too. For now, make it work by adding an extra mask selecting which CPUs are allowed to iterate up. The topology that triggered it is the one from David Rientjes: 10 20 20 30 20 10 20 20 20 20 10 20 30 20 20 10 resulting in boxes that wouldn't even boot. Reported-by: David Rientjes Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3p86l9cuaqnxz7uxsojmz5rm@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 64 +++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/fair.c | 5 ++-- kernel/sched/sched.h | 2 ++ 3 files changed, 61 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6546083af3e0..781acb91a50a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5994,6 +5994,44 @@ struct sched_domain_topology_level { struct sd_data data; }; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -6012,6 +6050,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); @@ -6019,8 +6063,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); @@ -6030,13 +6072,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); sg->sgp = *per_cpu_ptr(sdd->sgp, i); - atomic_inc(&sg->sgp->ref); + if (atomic_inc_return(&sg->sgp->ref) == 1) + build_group_mask(sd, sg); + + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - cpumask_first(sg_span) == cpu) { - WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); + group_balance_cpu(sg) == cpu) groups = sg; - } if (!first) first = sg; @@ -6109,6 +6156,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; + cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -6150,7 +6198,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sg = sg->next; } while (sg != sd->groups); - if (cpu != group_first_cpu(sg)) + if (cpu != group_balance_cpu(sg)) return; update_group_power(sd, cpu); @@ -6525,7 +6573,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power), + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sgp) return -ENOMEM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27b..54cbaa4e7b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, int i; if (local_group) - balance_cpu = group_first_cpu(group); + balance_cpu = group_balance_cpu(group); /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; @@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { + if (idle_cpu(i) && !first_idle_cpu && + cpumask_test_cpu(i, sched_group_mask(group))) { first_idle_cpu = 1; balance_cpu = i; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24ce..6d52cea7f33d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +extern int group_balance_cpu(struct sched_group *sg); + #endif /* CONFIG_SMP */ #include "stats.h" -- cgit v1.2.3 From c3decf0dfbc95736b7c0ab68fa4e5854c4734da9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 12:05:32 +0200 Subject: sched: Always initialize cpu-power Often when we run into mis-shapen topologies the balance iteration fails to update the cpu power properly and we'll end up in /0 traps. Always initialize the cpu-power to a semi-sane value so that we can at least boot the machine, even if the load-balancer might not function correctly. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3lbhyj25sr169ha7z3qht5na@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++++++++++- kernel/sched/fair.c | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 781acb91a50a..725ee7c1c8cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5604,7 +5604,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->sgp->power) { + /* + * Even though we initialize ->power to something semi-sane, + * we leave power_orig unset. This allows us to detect if + * domain iteration is still funny without causing /0 traps. + */ + if (!group->sgp->power_orig) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6075,6 +6080,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (atomic_inc_return(&sg->sgp->ref) == 1) build_group_mask(sd, sg); + /* + * Initialize sgp->power such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 54cbaa4e7b37..c9fd6d673d05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3602,7 +3602,7 @@ void update_group_power(struct sched_domain *sd, int cpu) } while (group != child->groups); } - sdg->sgp->power = power; + sdg->sgp->power_orig = sdg->sgp->power = power; } /* -- cgit v1.2.3 From d039ac60800fe8ed8522ec3b9ca796aaf748c18b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 21:20:16 +0200 Subject: sched: Validate assumptions in sched_init_numa() Add some code to validate assumptions we're making and output warnings if they are not. If this trigger we want to know about it. Signed-off-by: Peter Zijlstra Cc: Alex Shi Link: http://lkml.kernel.org/n/tip-6uc3wk5s9udxtdl9cnku0vtt@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 99 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 725ee7c1c8cf..2bdd17616437 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_domain_debug_enabled; +static __read_mostly int sched_debug_enabled; -static int __init sched_domain_debug_setup(char *str) +static int __init sched_debug_setup(char *str) { - sched_domain_debug_enabled = 1; + sched_debug_enabled = 1; return 0; } -early_param("sched_debug", sched_domain_debug_setup); +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) @@ -5657,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_domain_debug_enabled) + if (!sched_debug_enabled) return; if (!sd) { @@ -5678,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} #endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) @@ -6373,7 +6382,6 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol #ifdef CONFIG_NUMA static int sched_domains_numa_levels; -static int sched_domains_numa_scale; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; @@ -6438,6 +6446,42 @@ static const struct cpumask *sd_numa_mask(int cpu) return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; } +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6445,7 +6489,6 @@ static void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_scale = curr_distance; sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); if (!sched_domains_numa_distance) return; @@ -6456,23 +6499,41 @@ static void sched_init_numa(void) * * Assumes node_distance(0,j) includes all distances in * node_distance(i,j) in order to avoid cubic time. - * - * XXX: could be optimized to O(n log n) by using sort() */ next_distance = curr_distance; for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - int distance = node_distance(0, j); - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; } /* * 'level' contains the number of unique distances, excluding the -- cgit v1.2.3 From a841f8cef4bb124f0f5563314d0beaf2e1249d72 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 5 Jun 2012 13:44:36 -0500 Subject: sched: Fix the relax_domain_level boot parameter It does not get processed because sched_domain_level_max is 0 at the time that setup_relax_domain_level() is run. Simply accept the value as it is, as we don't know the value of sched_domain_level_max until sched domain construction is completed. Fix sched_relax_domain_level in cpuset. The build_sched_domain() routine calls the set_domain_attribute() routine prior to setting the sd->level, however, the set_domain_attribute() routine relies on the sd->level to decide whether idle load balancing will be off/on. Signed-off-by: Dimitri Sivanich Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120605184436.GA15668@sgi.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2bdd17616437..d5594a4268d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6268,11 +6268,8 @@ int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -6698,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; @@ -6706,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, child->parent = sd; } sd->child = child; + set_domain_attribute(sd, attr); return sd; } -- cgit v1.2.3 From c00b275043adc14d668f36266b890f0c53d46640 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:27:44 +0200 Subject: uprobes: Optimize is_swbp_at_addr() for current->mm Change is_swbp_at_addr() to try to avoid the costly read_opcode() if mm == current->mm, __copy_from_user_inatomic() should succeed in the likely case. Currently this optimization is not important, but we are going to add more is_swbp_at_addr(current->mm) callers. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192744.GA8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 985be4d80fe8..d0f5ec0dcdea 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -333,10 +333,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (current->mm == mm) { + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + } + result = read_opcode(mm, vaddr, &opcode); if (result) return result; - +out: if (is_swbp_insn(&opcode)) return 1; -- cgit v1.2.3 From a3d7bb47937b3a40b9f0c75655e97b3bb6407cbe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:27:59 +0200 Subject: uprobes: Change read_opcode() to use FOLL_FORCE set_orig_insn()->read_opcode() should not fail if the probed task did mprotect() after uprobe_register(), change it to use FOLL_FORCE. Without FOLL_WRITE this doesn't have any "side" effect but allows to read the !VM_READ memory. There is another reason for this change, we are going to use is_swbp_at_addr() from handle_swbp() which can race with another thread doing mprotect(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192759.GB8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d0f5ec0dcdea..a0dbc87a2ec6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -312,7 +312,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ void *vaddr_new; int ret; - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (ret <= 0) return ret; -- cgit v1.2.3 From 3a9ea0520f38def4a3915b91f82455b749f07d88 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:28:57 +0200 Subject: uprobes: Introduce find_active_uprobe() helper No functional changes. Move the "find uprobe" code from handle_swbp() to the new helper, find_active_uprobe(). Note: with or without this change, the find-active-uprobe logic is not exactly right. We can race with another thread which unmaps the memory with the valid uprobe before we take mm->mmap_sem. We can't find this uprobe simply because find_vma() fails. In this case we wrongly assume that this trap was not caused by uprobe and send the erroneous SIGTRAP. See the next changes. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192857.GC8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a0dbc87a2ec6..eaf4d55fd424 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1489,38 +1489,47 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } -/* - * Run handler and ask thread to singlestep. - * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. - */ -static void handle_swbp(struct pt_regs *regs) +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) { + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; struct vm_area_struct *vma; - struct uprobe_task *utask; - struct uprobe *uprobe; - struct mm_struct *mm; - unsigned long bp_vaddr; - uprobe = NULL; - bp_vaddr = uprobe_get_swbp_addr(regs); - mm = current->mm; down_read(&mm->mmap_sem); vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { - struct inode *inode; - loff_t offset; + if (vma && vma->vm_start <= bp_vaddr) { + if (valid_vma(vma, false)) { + struct inode *inode; + loff_t offset; - inode = vma->vm_file->f_mapping->host; - offset = bp_vaddr - vma->vm_start; - offset += (vma->vm_pgoff << PAGE_SHIFT); - uprobe = find_uprobe(inode, offset); + inode = vma->vm_file->f_mapping->host; + offset = bp_vaddr - vma->vm_start; + offset += (vma->vm_pgoff << PAGE_SHIFT); + uprobe = find_uprobe(inode, offset); + } } srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); current->uprobe_srcu_id = -1; up_read(&mm->mmap_sem); + return uprobe; +} + +/* + * Run handler and ask thread to singlestep. + * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. + */ +static void handle_swbp(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct uprobe *uprobe; + unsigned long bp_vaddr; + + bp_vaddr = uprobe_get_swbp_addr(regs); + uprobe = find_active_uprobe(bp_vaddr); + if (!uprobe) { /* No matching uprobe; signal SIGTRAP. */ send_sig(SIGTRAP, current, 0); -- cgit v1.2.3 From d790d34653ab20c74034902f5f0889bba807949a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:14 +0200 Subject: uprobes: Teach find_active_uprobe() to provide the "is_swbp" info A separate patch to simplify the review, and for the documentation. The patch adds another "int *is_swbp" argument to find_active_uprobe(), so far its only caller doesn't use this info. With this patch find_active_uprobe() additionally does: - if find_vma() + ->vm_start check fails, *is_swbp = -EFAULT - otherwise, if valid_vma() + find_uprobe() fails, it holds the result of is_swbp_at_addr(), can be negative too. The latter is only possible if we raced with another thread which did munmap/etc after we hit this bp. IOW. If find_active_uprobe(&is_swbp) returns NULL, the caller can look at is_swbp to figure out whether the current insn is bp or not, or detect the race with another thread if it is negative. Note: I think that performance-wise this change is fine. This adds is_swbp_at_addr(), but only if we raced with uprobe_unregister() or if we hit the "normal" int3 but this mm has uprobes as well. And even in this case the slow read_opcode() path is very unlikely, this insn recently triggered do_int3(), __copy_from_user_inatomic() shouldn't fail in the likely case. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192914.GD8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eaf4d55fd424..ee3df704e78a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1489,7 +1489,7 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; @@ -1497,7 +1497,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) down_read(&mm->mmap_sem); vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { struct inode *inode; @@ -1508,6 +1507,11 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) offset += (vma->vm_pgoff << PAGE_SHIFT); uprobe = find_uprobe(inode, offset); } + + if (!uprobe) + *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + } else { + *is_swbp = -EFAULT; } srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); @@ -1526,9 +1530,10 @@ static void handle_swbp(struct pt_regs *regs) struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr); + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { /* No matching uprobe; signal SIGTRAP. */ -- cgit v1.2.3 From 77fc4af1b59d12ab3b1467adf0a5204806853123 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:28 +0200 Subject: uprobes: Change register_for_each_vma() to take mm->mmap_sem for writing Change register_for_each_vma() to take mm->mmap_sem for writing. This is a bit unfortunate but hopefully not too bad, this is the slow path anyway. This is needed to ensure that find_active_uprobe() can not race with uprobe_register() which adds the new bp at the same bp_vaddr, after find_uprobe() fails and before is_swbp_at_addr_fast() checks the memory. IOW, this is needed to ensure that if find_active_uprobe() returns NULL but is_swbp == true, we can safely assume that it was the "normal" int3 and we should send SIGTRAP. There is another reason for this change. We are going to replace uprobes_state->count with MMF_ flags set by register/unregister and cleared by find_active_uprobe(), and set/clear shouldn't race with each other. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192928.GE8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ee3df704e78a..a2ed82b4808c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -853,12 +853,12 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) } mm = vi->mm; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, (unsigned long)vi->vaddr); if (!vma || !valid_vma(vma, is_register)) { list_del(&vi->probe_list); kfree(vi); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); continue; } @@ -867,7 +867,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr != vi->vaddr) { list_del(&vi->probe_list); kfree(vi); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); continue; } @@ -877,7 +877,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) else remove_breakpoint(uprobe, mm, vi->vaddr); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); if (is_register) { if (ret && ret == -EEXIST) -- cgit v1.2.3 From 56bb4cf6475d702d2fb00fc641aa6441097c0330 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:47 +0200 Subject: uprobes: Teach handle_swbp() to rely on "is_swbp" rather than uprobes_srcu Currently handle_swbp() assumes that it can't race with unregister, so it roughly does: if (find_uprobe(vaddr)) process_uprobe(); else send_sig(SIGTRAP); This relies on the not-really-working uprobes_srcu code we are going to remove, see the next patch. With this patch we rely on the result of is_swbp_at_addr(bp_vaddr) if find_uprobe() fails. If is_swbp == 1, then we hit the normal int3, we should send SIGTRAP. If is_swbp == 0, we raced with uprobe_unregister(), we simply restart this insn again. The "difficult" case is is_swbp == -EFAULT, when we can't read this memory. In this case I think we should restart too, and this is more correct compared to the current code which sends SIGTRAP. Ignoring ENOMEM/etc from get_user_pages(), this can only happen if another thread unmaps this memory before find_active_uprobe() takes mmap_sem. It would be better to pretend it was unmapped before this insn was executed, restart, and get SIGSEGV. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192947.GF8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a2ed82b4808c..1f02e3bbfc1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1530,14 +1530,26 @@ static void handle_swbp(struct pt_regs *regs) struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp; + int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { - /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + if (is_swbp > 0) { + /* No matching uprobe; signal SIGTRAP. */ + send_sig(SIGTRAP, current, 0); + } else { + /* + * Either we raced with uprobe_unregister() or we can't + * access this memory. The latter is only possible if + * another thread plays with our ->mm. In both cases + * we can simply restart. If this vma was unmapped we + * can pretend this insn was not executed yet and get + * the (correct) SIGSEGV after restart. + */ + instruction_pointer_set(regs, bp_vaddr); + } return; } -- cgit v1.2.3 From 778b032d96909690c19d84f8d17c13be65ed6f8e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:30:08 +0200 Subject: uprobes: Kill uprobes_srcu/uprobe_srcu_id Kill the no longer needed uprobes_srcu/uprobe_srcu_id code. It doesn't really work anyway. synchronize_srcu() can only synchronize with the code "inside" the srcu_read_lock/srcu_read_unlock section, while uprobe_pre_sstep_notifier() does srcu_read_lock() _after_ we already hit the breakpoint. I guess this probably works "in practice". synchronize_srcu() is slow and it implies synchronize_sched(), and the probed task enters the non- preemptible section at the start of exception handler. Still this is not right at least in theory, and task->uprobe_srcu_id blows task_struct. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529193008.GG8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1f02e3bbfc1d..8c5e043cd309 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -38,7 +38,6 @@ #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE -static struct srcu_struct uprobes_srcu; static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ @@ -738,20 +737,14 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) } /* - * There could be threads that have hit the breakpoint and are entering the - * notifier code and trying to acquire the uprobes_treelock. The thread - * calling delete_uprobe() that is removing the uprobe from the rb_tree can - * race with these threads and might acquire the uprobes_treelock compared - * to some of the breakpoint hit threads. In such a case, the breakpoint - * hit threads will not find the uprobe. The current unregistering thread - * waits till all other threads have hit a breakpoint, to acquire the - * uprobes_treelock before the uprobe is removed from the rbtree. + * There could be threads that have already hit the breakpoint. They + * will recheck the current insn and restart if find_uprobe() fails. + * See find_active_uprobe(). */ static void delete_uprobe(struct uprobe *uprobe) { unsigned long flags; - synchronize_srcu(&uprobes_srcu); spin_lock_irqsave(&uprobes_treelock, flags); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock_irqrestore(&uprobes_treelock, flags); @@ -1388,9 +1381,6 @@ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - if (t->uprobe_srcu_id != -1) - srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); - if (!utask) return; @@ -1408,7 +1398,6 @@ void uprobe_free_utask(struct task_struct *t) void uprobe_copy_process(struct task_struct *t) { t->utask = NULL; - t->uprobe_srcu_id = -1; } /* @@ -1513,9 +1502,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } else { *is_swbp = -EFAULT; } - - srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); - current->uprobe_srcu_id = -1; up_read(&mm->mmap_sem); return uprobe; @@ -1656,7 +1642,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) utask->state = UTASK_BP_HIT; set_thread_flag(TIF_UPROBE); - current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); return 1; } @@ -1691,7 +1676,6 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mutex[i]); mutex_init(&uprobes_mmap_mutex[i]); } - init_srcu_struct(&uprobes_srcu); return register_die_notifier(&uprobe_exception_nb); } -- cgit v1.2.3 From 967db0ea65b0bf8507a7643ac8f296c4f2c0a834 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Wed, 6 Jun 2012 18:51:35 -0700 Subject: cgroup: make sure that decisions in __css_put are atomic __css_put is using atomic_dec on the ref count, and then looking at the ref count to make decisions. This is prone to races, as someone else may decrement ref count between our decrement and our decision. Instead, we should base our decisions on the value that we decremented the ref count to. (This results in an actual race on Google's kernel which I haven't been able to reproduce on the upstream kernel. Having said that, it's still incorrect by inspection). Signed-off-by: Salman Qazi Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 72fcd3069a90..ceeafe874b3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4984,8 +4984,7 @@ void __css_put(struct cgroup_subsys_state *css) struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - atomic_dec(&css->refcnt); - switch (css_refcnt(css)) { + switch (atomic_dec_return(&css->refcnt)) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.2.3 From 6be96a5c905178637ec06a5d456a76b2b304fca3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 Jun 2012 19:12:30 -0700 Subject: cgroup: remove hierarchy_mutex It was introduced for memcg to iterate cgroup hierarchy without holding cgroup_mutex, but soon after that it was replaced with a lockless way in memcg. No one used hierarchy_mutex since that, so remove it. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 45 --------------------------------------------- 1 file changed, 45 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ceeafe874b3f..dec62f5936ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1073,28 +1073,24 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgrp); - mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); - mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); - mutex_unlock(&ss->hierarchy_mutex); /* subsystem is now free - drop reference on module */ module_put(ss->module); } else if (bit & final_bits) { @@ -3917,37 +3913,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } -static void cgroup_lock_hierarchy(struct cgroupfs_root *root) -{ - /* We need to take each hierarchy_mutex in a consistent order */ - int i; - - /* - * No worry about a race with rebind_subsystems that might mess up the - * locking order, since both parties are under cgroup_mutex. - */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_lock(&ss->hierarchy_mutex); - } -} - -static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) -{ - int i; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_unlock(&ss->hierarchy_mutex); - } -} - /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4008,9 +3973,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, ss->post_clone(cgrp); } - cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); - cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -4037,9 +4000,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: - cgroup_lock_hierarchy(root); list_del(&cgrp->sibling); - cgroup_unlock_hierarchy(root); root->number_of_cgroups--; err_destroy: @@ -4247,10 +4208,8 @@ again: list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ list_del_init(&cgrp->sibling); - cgroup_unlock_hierarchy(cgrp->root); list_del_init(&cgrp->allcg_node); @@ -4324,8 +4283,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* this function shouldn't be used with modular subsystems, since they @@ -4452,8 +4409,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* success! */ -- cgit v1.2.3 From f2bf1f6f5f89d031245067512449fc889b2f4bb2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jun 2012 19:50:40 -0400 Subject: tracing: Have tracing_off() actually turn tracing off A recent update to have tracing_on/off() only affect the ftrace ring buffers instead of all ring buffers had a cut and paste error. The tracing_off() did the exact same thing as tracing_on() and would not actually turn off tracing. Unfortunately, tracing_off() is more important to be working than tracing_on() as this is a key development tool, as it lets the developer turn off tracing as soon as a problem is discovered. It is also used by panic and oops code. This bug also breaks the 'echo func:traceoff > set_ftrace_filter' Cc: # 3.4 Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6177db..49249c28690d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); void tracing_off(void) { if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + ring_buffer_record_off(global_trace.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race -- cgit v1.2.3 From 8f5af6f1f2d09fe5eac86a5dc1731a5917c1503a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 May 2012 08:31:53 -0700 Subject: rcu: RCU_FAST_NO_HZ detection of callback adoption In the present implementations of CPU hotplug, the outgoing CPU is guaranteed to run its stop-machine process on the way out, which will guarantee that RCU_FAST_NO_HZ forces the CPU out of dyntick-idle mode. However, new versions of CPU hotplug might not work this way. This commit therefore removes this design constraint by explicitly notifying CPUs when they adopt non-lazy RCU callbacks. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88d92d0..3b0f1337f75b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1397,6 +1397,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; rdp->n_cbs_adopted += rsp->qlen; + if (rsp->qlen_lazy != rsp->qlen) + rcu_idle_count_callbacks_posted(); rsp->qlen_lazy = 0; rsp->qlen = 0; -- cgit v1.2.3 From fd4b352687fd8604d49c190c4c9ea9e369fd42d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 May 2012 19:10:35 -0700 Subject: rcu: Update RCU_FAST_NO_HZ tracing for lazy callbacks In the current code, a short dyntick-idle interval (where there is at least one non-lazy callback on the CPU) and a long dyntick-idle interval (where there are only lazy callbacks on the CPU) are traced identically, which can be less than helpful. This commit therefore emits different event traces in these two cases. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree_plugin.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d9869..5449f02c4820 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2165,15 +2165,17 @@ static void rcu_prepare_for_idle(int cpu) !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_GP_DELAY; - else + } else { per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_LAZY_GP_DELAY; + trace_rcu_prep_idle("Dyntick with lazy callbacks"); + } tp = &per_cpu(rcu_idle_gp_timer, cpu); mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); per_cpu(rcu_nonlazy_posted_snap, cpu) = -- cgit v1.2.3 From 5955f7eecd77d6b440db278b266cfecdb72ecd00 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 12:07:05 -0700 Subject: rcu: Move RCU_FAST_NO_HZ per-CPU variables to rcu_dynticks structure The RCU_FAST_NO_HZ code relies on a number of per-CPU variables. This works, but is hidden from someone scanning the data structures in rcutree.h. This commit therefore converts these per-CPU variables to fields in the per-CPU rcu_dynticks structures. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.h | 14 +++++++ kernel/rcutree_plugin.h | 99 ++++++++++++++++++++++--------------------------- 2 files changed, 58 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138dedf5..ea056495783e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,6 +84,20 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_RCU_FAST_NO_HZ + int dyntick_drain; /* Prepare-for-idle state variable. */ + unsigned long dyntick_holdoff; + /* No retries for the jiffy of failure. */ + struct timer_list idle_gp_timer; + /* Wake up CPU sleeping with callbacks. */ + unsigned long idle_gp_timer_expires; + /* When to wake up CPU (for repost). */ + bool idle_first_pass; /* First pass of attempt to go idle? */ + unsigned long nonlazy_posted; + /* # times non-lazy CBs posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* idle-period nonlazy_posted snapshot. */ +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5449f02c4820..6bd9637d5d83 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1962,21 +1962,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* Loop counter for rcu_prepare_for_idle(). */ -static DEFINE_PER_CPU(int, rcu_dyntick_drain); -/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ -static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); -/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ -static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); -/* Enable special processing on first attempt to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(bool, rcu_idle_first_pass); -/* Running count of non-lazy callbacks posted, never decremented. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); -/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); - /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no * callbacks on this CPU, (2) this CPU has not yet attempted to enter @@ -1988,13 +1973,15 @@ static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); */ int rcu_needs_cpu(int cpu) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + /* Flag a new idle sojourn to the idle-entry state machine. */ - per_cpu(rcu_idle_first_pass, cpu) = 1; + rdtp->idle_first_pass = 1; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(cpu)) return 0; /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; + return rdtp->dyntick_holdoff == jiffies; } /* @@ -2075,21 +2062,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) */ static void rcu_prepare_for_idle_init(int cpu) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_timer_func, cpu); - per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; - per_cpu(rcu_idle_first_pass, cpu) = 1; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + rdtp->dyntick_holdoff = jiffies - 1; + setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); + rdtp->idle_gp_timer_expires = jiffies - 1; + rdtp->idle_first_pass = 1; } /* * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * is no longer any point to ->idle_gp_timer, so cancel it. This will * do nothing if this timer is not active, so just cancel it unconditionally. */ static void rcu_cleanup_after_idle(int cpu) { - del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); } @@ -2108,42 +2098,41 @@ static void rcu_cleanup_after_idle(int cpu) * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * later. The ->dyntick_drain field controls the sequencing. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle * loop, then don't take any state-machine actions, unless the * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * Instead, repost the ->idle_gp_timer if this CPU has callbacks * pending. */ - if (!per_cpu(rcu_idle_first_pass, cpu) && - (per_cpu(rcu_nonlazy_posted, cpu) == - per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (!rdtp->idle_first_pass && + (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { if (rcu_cpu_has_callbacks(cpu)) { - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); } return; } - per_cpu(rcu_idle_first_pass, cpu) = 0; - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu) - 1; + rdtp->idle_first_pass = 0; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. */ if (!rcu_cpu_has_callbacks(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; + rdtp->dyntick_holdoff = jiffies - 1; + rdtp->dyntick_drain = 0; trace_rcu_prep_idle("No callbacks"); return; } @@ -2152,38 +2141,37 @@ static void rcu_prepare_for_idle(int cpu) * If in holdoff mode, just return. We will presumably have * refrained from disabling the scheduling-clock tick. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + if (rdtp->dyntick_holdoff == jiffies) { trace_rcu_prep_idle("In holdoff"); return; } - /* Check and update the rcu_dyntick_drain sequencing. */ - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* Check and update the ->dyntick_drain sequencing. */ + if (rdtp->dyntick_drain <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; - } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + rdtp->dyntick_drain = RCU_IDLE_FLUSHES; + } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_drain = 0; + rdtp->dyntick_holdoff = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_GP_DELAY; } else { - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_LAZY_GP_DELAY; trace_rcu_prep_idle("Dyntick with lazy callbacks"); } - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; return; /* Nothing more to do immediately. */ - } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + } else if (--(rdtp->dyntick_drain) <= 0) { /* We have hit the limit, so time to give up. */ - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_holdoff = jiffies; trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; @@ -2229,7 +2217,7 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_idle_count_callbacks_posted(void) { - __this_cpu_add(rcu_nonlazy_posted, 1); + __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ @@ -2240,11 +2228,12 @@ static void rcu_idle_count_callbacks_posted(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct timer_list *tltp = &rdtp->idle_gp_timer; sprintf(cp, "drain=%d %c timer=%lu", - per_cpu(rcu_dyntick_drain, cpu), - per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + rdtp->dyntick_drain, + rdtp->dyntick_holdoff == jiffies ? 'H' : '.', timer_pending(tltp) ? tltp->expires - jiffies : -1); } -- cgit v1.2.3 From aa9b16306e3243229580ff889cc59fd66bf77973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 16:41:44 -0700 Subject: rcu: Precompute RCU_FAST_NO_HZ timer offsets When a CPU is entering dyntick-idle mode, tick_nohz_stop_sched_tick() calls rcu_needs_cpu() see if RCU needs that CPU, and, if not, computes the next wakeup time based on the timer wheels. Only later, when actually entering the idle loop, rcu_prepare_for_idle() will be invoked. In some cases, rcu_prepare_for_idle() will post timers to wake the CPU back up. But all for naught: The next wakeup time for the CPU has already been computed, and posting a timer afterwards does not force that wakeup time to be recomputed. This means that rcu_prepare_for_idle()'s have no effect. This is not a problem on a busy system because something else will wake up the CPU soon enough. However, on lightly loaded systems, the CPU might stay asleep for a considerable length of time. If that CPU has a callback that the rest of the system is waiting on, the system might run very slowly or (in theory) even hang. This commit avoids this problem by having rcu_needs_cpu() give tick_nohz_stop_sched_tick() an estimate of when RCU will need the CPU to wake back up, which tick_nohz_stop_sched_tick() takes into account when programming the CPU's wakeup time. An alternative approach is for rcu_prepare_for_idle() to use hrtimers instead of normal timers, but timers are much more efficient than are hrtimers for frequently and repeatedly posting and cancelling a given timer, which is exactly what RCU_FAST_NO_HZ does. Reported-by: Pascal Chapperon Reported-by: Heiko Carstens Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree_plugin.h | 66 +++++++++++++++++++++++++++++++----------------- kernel/time/tick-sched.c | 7 ++++- 2 files changed, 49 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6bd9637d5d83..5271a020887e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(int cpu) +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } @@ -1962,28 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return rdtp->dyntick_holdoff == jiffies; -} - /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2026,6 +2005,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) rcu_preempt_cpu_has_nonlazy_callbacks(cpu); } +/* + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + * + * The delta_jiffies argument is used to store the time when RCU is + * going to need the CPU again if it still has callbacks. The reason + * for this is that rcu_prepare_for_idle() might need to post a timer, + * but if so, it will do so after tick_nohz_stop_sched_tick() has set + * the wakeup time for this CPU. This means that RCU's timer can be + * delayed until the wakeup time, which defeats the purpose of posting + * a timer. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Flag a new idle sojourn to the idle-entry state machine. */ + rdtp->idle_first_pass = 1; + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) { + *delta_jiffies = ULONG_MAX; + return 0; + } + if (rdtp->dyntick_holdoff == jiffies) { + /* RCU recently tried and failed, so don't try again. */ + *delta_jiffies = 1; + return 1; + } + /* Set up for the possibility that RCU will post a timer. */ + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + *delta_jiffies = RCU_IDLE_GP_DELAY; + else + *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + return 0; +} + /* * Handler for smp_call_function_single(). The only point of this * handler is to wake the CPU up, so the handler does only tracing. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..52f5ebbd443b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off -- cgit v1.2.3 From bafb282df29c1524b1617019adebd6d0c3eb7a47 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal A fix for commit b32dfe377102 ("c/r: prctl: add ability to set new mm_struct::exe_file"). After removing mm->num_exe_file_vmas kernel keeps mm->exe_file until final mmput(), it never becomes NULL while task is alive. We can check for other mapped files in mm instead of checking mm->num_exe_file_vmas, and mark mm with flag MMF_EXE_FILE_CHANGED in order to forbid second changing of mm->exe_file. Signed-off-by: Konstantin Khlebnikov Reviewed-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 9ff89cb9657a..54f20fdee93c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1796,17 +1796,11 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma, static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { + struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; - /* - * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's - * remain. So perform a quick test first. - */ - if (mm->num_exe_file_vmas) - return -EBUSY; - exe_file = fget(fd); if (!exe_file) return -EBADF; @@ -1827,17 +1821,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; + down_write(&mm->mmap_sem); + + /* + * Forbid mm->exe_file change if there are mapped other files. + */ + err = -EBUSY; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file && !path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_unlock; + } + /* * The symlink can be changed only once, just to disallow arbitrary * transitions malicious software might bring in. This means one * could make a snapshot over all processes running and monitor * /proc/pid/exe changes to notice unusual activity if needed. */ - down_write(&mm->mmap_sem); - if (likely(!mm->exe_file)) - set_mm_exe_file(mm, exe_file); - else - err = -EBUSY; + err = -EPERM; + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) + goto exit_unlock; + + set_mm_exe_file(mm, exe_file); +exit_unlock: up_write(&mm->mmap_sem); exit: -- cgit v1.2.3 From 1ad75b9e16280ca4e2501a629a225319cf2eef2e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: c/r: prctl: add minimal address test to PR_SET_MM Make sure the address being set is greater than mmap_min_addr (as suggested by Kees Cook). Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Serge Hallyn Cc: Tejun Heo Cc: Pavel Emelyanov Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 54f20fdee93c..19a2c7139960 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1869,7 +1869,7 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; -- cgit v1.2.3 From 300f786b2683f8bb1ec0afb6e1851183a479c86d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: c/r: prctl: add ability to get clear_tid_address Zero is written at clear_tid_address when the process exits. This functionality is used by pthread_join(). We already have sys_set_tid_address() to change this address for the current task but there is no way to obtain it from user space. Without the ability to find this address and dump it we can't restore pthread'ed apps which call pthread_join() once they have been restored. This patch introduces the PR_GET_TID_ADDRESS prctl option which allows the current process to obtain own clear_tid_address. This feature is available iif CONFIG_CHECKPOINT_RESTORE is set. [akpm@linux-foundation.org: fix prctl numbering] Signed-off-by: Andrew Vagin Signed-off-by: Cyrill Gorcunov Cc: Pedro Alves Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Tejun Heo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 19a2c7139960..0ec1942ba7ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1988,12 +1988,22 @@ out: up_read(&mm->mmap_sem); return error; } + +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return put_user(me->clear_child_tid, tid_addr); +} + #else /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { return -EINVAL; } +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return -EINVAL; +} #endif SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -2131,6 +2141,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; default: return -EINVAL; } -- cgit v1.2.3 From 736f24d5e59d699c6e300c5da7e3bb882eddda67 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: c/r: prctl: drop VMA flags test on PR_SET_MM_ stack data assignment In commit b76437579d13 ("procfs: mark thread stack correctly in proc//maps") the stack allocated via clone() is marked in /proc//maps as [stack:%d] thus it might be out of the former mm->start_stack/end_stack values (and even has some custom VMA flags set). So to be able to restore mm->start_stack/end_stack drop vma flags test, but still require the underlying VMA to exist. As always note this feature is under CONFIG_CHECKPOINT_RESTORE and requires CAP_SYS_RESOURCE to be granted. Signed-off-by: Cyrill Gorcunov Cc: Oleg Nesterov Acked-by: Kees Cook Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 0ec1942ba7ea..f0ec44dcd415 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1786,14 +1786,6 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE -static bool vma_flags_mismatch(struct vm_area_struct *vma, - unsigned long required, - unsigned long banned) -{ - return (vma->vm_flags & required) != required || - (vma->vm_flags & banned); -} - static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct vm_area_struct *vma; @@ -1931,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EFAULT; goto out; } -#ifdef CONFIG_STACK_GROWSUP - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) -#else - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) -#endif - goto out; if (opt == PR_SET_MM_START_STACK) mm->start_stack = addr; else if (opt == PR_SET_MM_ARG_START) -- cgit v1.2.3 From 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:14 -0700 Subject: mm: correctly synchronize rss-counters at exit/exec mm->rss_stat counters have per-task delta: task->rss_stat. Before changing task->mm pointer the kernel must flush this delta with sync_mm_rss(). do_exit() already calls sync_mm_rss() to flush the rss-counters before committing the rss statistics into task->signal->maxrss, taskstats, audit and other stuff. Unfortunately the kernel does this before calling mm_release(), which can call put_user() for processing task->clear_child_tid. So at this point we can trigger page-faults and task->rss_stat becomes non-zero again. As a result mm->rss_stat becomes inconsistent and check_mm() will print something like this: | BUG: Bad rss-counter state mm:ffff88020813c380 idx:1 val:-1 | BUG: Bad rss-counter state mm:ffff88020813c380 idx:2 val:1 This patch moves sync_mm_rss() into mm_release(), and moves mm_release() out of do_exit() and calls it earlier. After mm_release() there should be no pagefaults. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Konstantin Khlebnikov Reported-by: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: [3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 13 ++++++++----- kernel/fork.c | 8 ++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..804fb6bb8161 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,6 +423,7 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ + mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -640,7 +641,6 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; - mm_release(tsk, mm); if (!mm) return; /* @@ -960,9 +960,13 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); + + /* Set exit_code before complete_vfork_done() in mm_release() */ + tsk->exit_code = code; + + /* Release mm and sync mm's RSS info before statistics gathering */ + mm_release(tsk, tsk->mm); + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -975,7 +979,6 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); - tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..0560781c6904 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,6 +619,14 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } + + /* + * Final rss-counter synchronization. After this point there must be + * no pagefaults into this mm from the current context. Otherwise + * mm->rss_stat will be inconsistent. + */ + if (mm) + sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); -- cgit v1.2.3 From 48d212a2eecaca2e1875925837ad27b2f43f48a3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Jun 2012 17:54:07 -0700 Subject: Revert "mm: correctly synchronize rss-counters at exit/exec" This reverts commit 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94. It's horribly and utterly broken for at least the following reasons: - calling sync_mm_rss() from mmput() is fundamentally wrong, because there's absolutely no reason to believe that the task that does the mmput() always does it on its own VM. Example: fork, ptrace, /proc - you name it. - calling it *after* having done mmdrop() on it is doubly insane, since the mm struct may well be gone now. - testing mm against NULL before you call it is insane too, since a NULL mm there would have caused oopses long before. .. and those are just the three bugs I found before I decided to give up looking for me and revert it asap. I should have caught it before I even took it, but I trusted Andrew too much. Cc: Konstantin Khlebnikov Cc: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 13 +++++-------- kernel/fork.c | 8 -------- 2 files changed, 5 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 804fb6bb8161..34867cc5b42a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,7 +423,6 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ - mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -641,6 +640,7 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; + mm_release(tsk, mm); if (!mm) return; /* @@ -960,13 +960,9 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - - /* Set exit_code before complete_vfork_done() in mm_release() */ - tsk->exit_code = code; - - /* Release mm and sync mm's RSS info before statistics gathering */ - mm_release(tsk, tsk->mm); - + /* sync mm's RSS info before statistics gathering */ + if (tsk->mm) + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -979,6 +975,7 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 0560781c6904..ab5211b9e622 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,14 +619,6 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } - - /* - * Final rss-counter synchronization. After this point there must be - * no pagefaults into this mm from the current context. Otherwise - * mm->rss_stat will be inconsistent. - */ - if (mm) - sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); -- cgit v1.2.3 From 7eb9ba5ed312ec6ed9d22259c5da1acb7cf4bd29 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 8 Jun 2012 15:02:57 +0530 Subject: uprobes: Pass probed vaddr to arch_uprobe_analyze_insn() On RISC architectures like powerpc, instructions are fixed size. Instruction analysis on such platforms is just a matter of (insn % 4). Pass the vaddr at which the uprobe is to be inserted so that arch_uprobe_analyze_insn() can flag misaligned registration requests. Signed-off-by: Ananth N Mavinakaynahalli Cc: michael@ellerman.id.au Cc: antonb@thinktux.localdomain Cc: Paul Mackerras Cc: benh@kernel.crashing.org Cc: peterz@infradead.org Cc: Srikar Dronamraju Cc: Jim Keniston Cc: oleg@redhat.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20120608093257.GG13409@in.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c5e043cd309..b52376d02332 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -706,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -EEXIST; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) return ret; -- cgit v1.2.3 From cd96891d48a945ca2011fbeceda73813d6286195 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Jun 2012 13:18:33 -0700 Subject: sched/fair: fix lots of kernel-doc warnings Fix lots of new kernel-doc warnings in kernel/sched/fair.c: Warning(kernel/sched/fair.c:3625): No description found for parameter 'env' Warning(kernel/sched/fair.c:3625): Excess function parameter 'sd' description in 'update_sg_lb_stats' Warning(kernel/sched/fair.c:3735): No description found for parameter 'env' Warning(kernel/sched/fair.c:3735): Excess function parameter 'sd' description in 'update_sd_pick_busiest' Warning(kernel/sched/fair.c:3735): Excess function parameter 'this_cpu' description in 'update_sd_pick_busiest' .. more warnings Signed-off-by: Randy Dunlap Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27b..d5583f9588e7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3632,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. + * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. @@ -3741,11 +3741,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, /** * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked + * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics - * @this_cpu: the current cpu * * Determine if @sg is a busier group than the previously selected * busiest group. @@ -3783,9 +3782,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu + * @env: The load balancing environment. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -3874,10 +3871,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, * Returns 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * - * @sd: The sched_domain whose packing is to be checked. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing. */ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) { @@ -3903,9 +3898,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during * load balancing. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. */ static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) @@ -4048,11 +4042,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * Also calculates the amount of weighted load which should be moved * to restore balance. * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. + * @env: The load balancing environment. * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. -- cgit v1.2.3 From b871a42b6091b720e82ddff237659534c525c25b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Jun 2012 15:07:08 +0200 Subject: smpboot: Remove leftover declaration Signed-off-by: Thomas Gleixner --- kernel/smpboot.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 80c0acfb8472..6ef9433e1c70 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -3,8 +3,6 @@ struct task_struct; -int smpboot_prepare(unsigned int cpu); - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD struct task_struct *idle_thread_get(unsigned int cpu); void idle_thread_set_boot_cpu(void); -- cgit v1.2.3 From 19f5f7364a1cc770b14692f609bb9b802fc334d5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 27 Jul 2011 17:29:28 +0200 Subject: nohz: Separate idle sleeping time accounting from nohz logic As we plan to be able to stop the tick outside the idle task, we need to prepare for separating nohz logic from idle. As a start, this pulls the idle sleeping time accounting out of the tick stop/restart API to the callers on idle entry/exit. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 77 ++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da70c6db496c..81409bba2425 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,10 +271,10 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, now; + ktime_t last_update, expires; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; @@ -282,8 +282,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); - now = tick_nohz_start_idle(cpu, ts); - /* * If this cpu is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by @@ -444,6 +442,14 @@ out: ts->sleep_length = ktime_sub(dev->next_event, now); } +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ + ktime_t now; + + now = tick_nohz_start_idle(smp_processor_id(), ts); + tick_nohz_stop_sched_tick(ts, now); +} + /** * tick_nohz_idle_enter - stop the idle tick from the idle task * @@ -479,7 +485,7 @@ void tick_nohz_idle_enter(void) * update of the idle time accounting in tick_nohz_start_idle(). */ ts->inidle = 1; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); local_irq_enable(); } @@ -499,7 +505,7 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); } /** @@ -540,39 +546,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } } -/** - * tick_nohz_idle_exit - restart the idle tick from the idle task - * - * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. - */ -void tick_nohz_idle_exit(void) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); #ifndef CONFIG_VIRT_CPU_ACCOUNTING unsigned long ticks; #endif - ktime_t now; - - local_irq_disable(); - - WARN_ON_ONCE(!ts->inidle); - - ts->inidle = 0; - - if (ts->idle_active || ts->tick_stopped) - now = ktime_get(); - - if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); - - if (!ts->tick_stopped) { - local_irq_enable(); - return; - } - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); @@ -600,6 +578,35 @@ void tick_nohz_idle_exit(void) ts->idle_exittime = now; tick_nohz_restart(ts, now); +} + +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. + */ +void tick_nohz_idle_exit(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; + + local_irq_disable(); + + WARN_ON_ONCE(!ts->inidle); + + ts->inidle = 0; + + if (ts->idle_active || ts->tick_stopped) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + + if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, now); local_irq_enable(); } -- cgit v1.2.3 From 2ac0d98fd624ae50f5e6ae9c800977a9dbbfcde6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jul 2011 04:00:47 +0200 Subject: nohz: Make nohz API agnostic against idle ticks cputime accounting When the timer tick fires, it accounts the new jiffy as either part of system, user or idle time. This is how we record the cputime statistics. But when the tick is stopped from the idle task, we still need to record the number of jiffies spent tickless until we restart the tick and fall back to traditional tick-based cputime accounting. To do this, we take a snapshot of jiffies when the tick is stopped and compute the difference against the new value of jiffies when the tick is restarted. Then we account this whole difference to the idle cputime. However we are preparing to be able to stop the tick from other places than idle. So this idle time accounting needs to be performed from the callers of nohz APIs, not from the nohz APIs themselves because we now want them to be agnostic against places that stop/restart tick. Therefore, we pull the tickless idle time accounting out of generic nohz helpers up to idle entry/exit callers. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81409bba2425..911834b33b8a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -402,7 +402,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; } ts->idle_sleeps++; @@ -445,9 +444,13 @@ out: static void __tick_nohz_idle_enter(struct tick_sched *ts) { ktime_t now; + int was_stopped = ts->tick_stopped; now = tick_nohz_start_idle(smp_processor_id(), ts); tick_nohz_stop_sched_tick(ts, now); + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; } /** @@ -548,15 +551,25 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - unsigned long ticks; -#endif /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); update_cpu_load_nohz(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ #ifndef CONFIG_VIRT_CPU_ACCOUNTING + unsigned long ticks; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -569,15 +582,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); #endif - - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); } /** @@ -605,8 +609,10 @@ void tick_nohz_idle_exit(void) if (ts->idle_active) tick_nohz_stop_idle(cpu, now); - if (ts->tick_stopped) + if (ts->tick_stopped) { tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); + } local_irq_enable(); } @@ -811,7 +817,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - ts->idle_jiffies++; + if (idle_cpu(cpu)) + ts->idle_jiffies++; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); -- cgit v1.2.3 From f5d411c91ede162240f34e05a233f2759412988e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 31 Jul 2011 17:44:12 +0200 Subject: nohz: Rename ts->idle_tick to ts->last_tick Now that idle and nohz logics are going to be independant each others, ts->idle_tick becomes too much a biased name to describe the field that saves the last scheduled tick on top of which we re-calculate the next tick to schedule when the timer is restarted. We want to reuse this even to stop the tick outside idle cases. So let's rename it to some more generic name: ts->last_tick. This changes a bit the timer list stat export so we need to increase its version. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- kernel/time/timer_list.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 911834b33b8a..73cc4901336d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -400,7 +400,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) if (!ts->tick_stopped) { select_nohz_load_balancer(1); - ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; } @@ -526,7 +526,7 @@ ktime_t tick_nohz_get_sleep_length(void) static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); while (1) { /* Forward the time to expire in the future */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3258455549f4..af5a7e9f164b 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); - P_ns(idle_tick); + P_ns(last_tick); P(tick_stopped); P(idle_jiffies); P(idle_calls); @@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.6\n"); + SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.2.3 From 5b39939a40801f0c17e31adaf643d6e974227856 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Aug 2011 00:06:10 +0200 Subject: nohz: Move ts->idle_calls incrementation into strict idle logic Since we want to prepare for making the nohz API to work further the idle case, we need to pull ts->idle_calls incrementation up to the callers in idle. To perform this, we split tick_nohz_stop_sched_tick() in two parts: a first one that checks if we can really stop the tick for idle, and another that actually stops it. Then from the callers in idle, we check if we can stop the tick and only then we increment idle_calls and finally relay to the nohz API that won't care about these details anymore. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 86 ++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 73cc4901336d..430e1b6901cc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,47 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; - int cpu; - - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - } - - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - return; - - if (need_resched()) - return; - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } - return; - } - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&xtime_lock); @@ -441,16 +409,56 @@ out: ts->sleep_length = ktime_sub(dev->next_event, now); } +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + return false; + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + return true; +} + static void __tick_nohz_idle_enter(struct tick_sched *ts) { ktime_t now; - int was_stopped = ts->tick_stopped; + int cpu = smp_processor_id(); - now = tick_nohz_start_idle(smp_processor_id(), ts); - tick_nohz_stop_sched_tick(ts, now); + now = tick_nohz_start_idle(cpu, ts); - if (!was_stopped && ts->tick_stopped) - ts->idle_jiffies = ts->last_jiffies; + if (can_stop_idle_tick(cpu, ts)) { + int was_stopped = ts->tick_stopped; + + ts->idle_calls++; + tick_nohz_stop_sched_tick(ts, now, cpu); + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; + } } /** -- cgit v1.2.3 From 84bf1bccc60cc64376125ea2eae05e4ba12f795b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Aug 2011 01:25:38 +0200 Subject: nohz: Move next idle expiry time record into idle logic area The next idle expiry time record and idle sleeps tracking are statistics that only concern idle. Since we want the nohz APIs to become usable further idle context, let's pull up the handling of these statistics to the callers in idle. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 430e1b6901cc..60c9c60e9108 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,11 +271,11 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires; + ktime_t last_update, expires, ret = { .tv64 = 0 }; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -358,6 +358,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; + ret = expires; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -372,11 +374,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->tick_stopped = 1; } - ts->idle_sleeps++; - - /* Mark expires */ - ts->idle_expires = expires; - /* * If the expiration time == KTIME_MAX, then * in this case we simply stop the tick timer. @@ -407,6 +404,8 @@ out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); + + return ret; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) @@ -445,7 +444,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) static void __tick_nohz_idle_enter(struct tick_sched *ts) { - ktime_t now; + ktime_t now, expires; int cpu = smp_processor_id(); now = tick_nohz_start_idle(cpu, ts); @@ -454,7 +453,12 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) int was_stopped = ts->tick_stopped; ts->idle_calls++; - tick_nohz_stop_sched_tick(ts, now, cpu); + + expires = tick_nohz_stop_sched_tick(ts, now, cpu); + if (expires.tv64 > 0LL) { + ts->idle_sleeps++; + ts->idle_expires = expires; + } if (!was_stopped && ts->tick_stopped) ts->idle_jiffies = ts->last_jiffies; -- cgit v1.2.3 From 6ebb017de9d59a18c3ff9648270e8f6abaa93438 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 5 Jun 2012 08:52:34 +0200 Subject: printk: Fix alignment of buf causing crash on ARM EABI Commit 7ff9554bb578ba02166071d2d487b7fc7d860d62, printk: convert byte-buffer to variable-length record buffer, causes systems using EABI to crash very early in the boot cycle. The first entry in struct log is a u64, which for EABI must be 8 byte aligned. Make use of __alignof__() so the compiler to decide the alignment, but allow it to be overridden using CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, for systems which can perform unaligned access and want to save a few bytes of space. Tested on Orion5x and Kirkwood. Signed-off-by: Andrew Lunn Tested-by: Stephen Warren Acked-by: Stephen Warren Acked-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 32462d2b364a..f205c25c37e2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -227,10 +227,10 @@ static u32 clear_idx; #define LOG_LINE_MAX 1024 /* record buffer */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 #else -#define LOG_ALIGN 8 +#define LOG_ALIGN __alignof__(struct log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); -- cgit v1.2.3 From 047fe3605235888f3ebcda0c728cb31937eadfe6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 15:24:40 +0200 Subject: splice: fix racy pipe->buffers uses Dave Jones reported a kernel BUG at mm/slub.c:3474! triggered by splice_shrink_spd() called from vmsplice_to_pipe() commit 35f3d14dbbc5 (pipe: add support for shrinking and growing pipes) added capability to adjust pipe->buffers. Problem is some paths don't hold pipe mutex and assume pipe->buffers doesn't change for their duration. Fix this by adding nr_pages_max field in struct splice_pipe_desc, and use it in place of pipe->buffers where appropriate. splice_shrink_spd() loses its struct pipe_inode_info argument. Reported-by: Dave Jones Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Alexander Viro Cc: Tom Herbert Cc: stable # 2.6.35 Tested-by: Dave Jones Signed-off-by: Jens Axboe --- kernel/relay.c | 5 +++-- kernel/trace/trace.c | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index ab56a1764d4d..e8cd2027abbd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, + .nr_pages_max = PIPE_DEF_BUFFERS, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, @@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, ret += padding; out: - splice_shrink_spd(pipe, &spd); - return ret; + splice_shrink_spd(&spd); + return ret; } static ssize_t relay_file_splice_read(struct file *in, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6177db..288488082224 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, @@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = splice_to_pipe(pipe, &spd); out: - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return ret; out_err: @@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); out: return ret; } -- cgit v1.2.3 From 82ec90eac304e81b1389175b4dded7abecc678ef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 May 2012 18:51:11 -0700 Subject: resources: allow adjust_resource() for resources with no parent If a resource has no parent, allow its start/end to be set arbitrarily as long as any children are still contained within the new range. [bhelgaas: changelog] Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e1d2b8ee76d5..dc8b47764443 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -722,14 +722,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t write_lock(&resource_lock); + if (!parent) + goto skip; + if ((start < parent->start) || (end > parent->end)) goto out; - for (tmp = res->child; tmp; tmp = tmp->sibling) { - if ((tmp->start < start) || (tmp->end > end)) - goto out; - } - if (res->sibling && (res->sibling->start <= end)) goto out; @@ -741,6 +739,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t goto out; } +skip: + for (tmp = res->child; tmp; tmp = tmp->sibling) + if ((tmp->start < start) || (tmp->end > end)) + goto out; + res->start = start; res->end = end; result = 0; -- cgit v1.2.3 From a70270468234749741c5893ae78e5bb524771402 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 13 Jun 2012 09:35:48 -0400 Subject: watchdog: Quiet down the boot messages A bunch of bugzillas have complained how noisy the nmi_watchdog is during boot-up especially with its expected failure cases (like virt and bios resource contention). This is my attempt to quiet them down and keep it less confusing for the end user. What I did is print the message for cpu0 and save it for future comparisons. If future cpus have an identical message as cpu0, then don't print the redundant info. However, if a future cpu has a different message, happily print that loudly. Before the change, you would see something like: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz stepping 0a Performance Events: PEBS fmt0+, Core2 events, Intel PMU driver. ... version: 2 ... bit width: 40 ... generic registers: 2 ... value mask: 000000ffffffffff ... max period: 000000007fffffff ... fixed-purpose events: 3 ... event mask: 0000000700000003 NMI watchdog enabled, takes one hw-pmu counter. Booting Node 0, Processors #1 NMI watchdog enabled, takes one hw-pmu counter. #2 NMI watchdog enabled, takes one hw-pmu counter. #3 Ok. NMI watchdog enabled, takes one hw-pmu counter. Brought up 4 CPUs Total of 4 processors activated (22607.24 BogoMIPS). After the change, it is simplified to: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz stepping 0a Performance Events: PEBS fmt0+, Core2 events, Intel PMU driver. ... version: 2 ... bit width: 40 ... generic registers: 2 ... value mask: 000000ffffffffff ... max period: 000000007fffffff ... fixed-purpose events: 3 ... event mask: 0000000700000003 NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. Booting Node 0, Processors #1 #2 #3 Ok. Brought up 4 CPUs V2: little changes based on Joe Perches' feedback V3: printk cleanup based on Ingo's feedback; checkpatch fix V4: keep printk as one long line V5: Ingo fix ups Reported-and-tested-by: Nathan Zimmer Signed-off-by: Don Zickus Cc: nzimmer@sgi.com Cc: joe@perches.com Link: http://lkml.kernel.org/r/1339594548-17227-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85b8c7c..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -372,6 +372,13 @@ static int watchdog(void *unused) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + static int watchdog_nmi_enable(int cpu) { struct perf_event_attr *wd_attr; @@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + if (!IS_ERR(event)) { - pr_info("enabled, takes one hw-pmu counter.\n"); + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) -- cgit v1.2.3 From 8d240dd88cca33b704adf3fe281aa64b5aac2dd8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 29 Mar 2012 19:11:40 +0200 Subject: ftrace: Remove a superfluous check register_ftrace_function() checks ftrace_disabled and calls __register_ftrace_function which does it again. Drop the first check and add the unlikely hint to the second one. Also, drop the label as John correctly notices. No functional change. Link: http://lkml.kernel.org/r/20120329171140.GE6409@aftab Cc: Borislav Petkov Cc: John Kacur Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663d86c8..b4f20fba09fc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (ftrace_disabled) + if (unlikely(ftrace_disabled)) return -ENODEV; if (FTRACE_WARN_ON(ops == &global_ops)) @@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) - goto out_unlock; - ret = __register_ftrace_function(ops); if (!ret) ret = ftrace_startup(ops, 0); - - out_unlock: mutex_unlock(&ftrace_lock); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); -- cgit v1.2.3 From 7374e82771c6d5a9af2080be46f64a5826c7efb1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 31 May 2012 21:40:05 -0400 Subject: tracing: Register the ftrace internal events during early boot All trace events including ftrace internel events (like trace_printk and function tracing), register functions that describe how to print their output. The events may be recorded as soon as the ring buffer is allocated, but they are just raw binary in the buffer. The mapping of event ids to how to print them are held within a structure that is registered on system boot. If a crash happens in boot up before these functions are registered then their output (via ftrace_dump_on_oops) will be useless: Dumping ftrace buffer: --------------------------------- <...>-1 0.... 319705us : Unknown type 6 --------------------------------- This can be quite frustrating for a kernel developer trying to see what is going wrong. There's no reason to register them so late in the boot up process. They can be registered by early_initcall(). Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df611a0e76c5..123b189c732c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1325,4 +1325,4 @@ __init static int init_events(void) return 0; } -device_initcall(init_events); +early_initcall(init_events); -- cgit v1.2.3 From e2ae715d66bf4becfb85eb84b7150e23cf27df30 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 15 Jun 2012 14:07:51 +0200 Subject: kmsg - kmsg_dump() use iterator to receive log buffer content Provide an iterator to receive the log buffer content, and convert all kmsg_dump() users to it. The structured data in the kmsg buffer now contains binary data, which should no longer be copied verbatim to the kmsg_dump() users. The iterator should provide reliable access to the buffer data, and also supports proper log line-aware chunking of data while iterating. Signed-off-by: Kay Sievers Tested-by: Tony Luck Reported-by: Anton Vorontsov Tested-by: Anton Vorontsov Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 192 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f205c25c37e2..ceb4a2f775a1 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -909,7 +909,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. - */ + */ seq = clear_seq; idx = clear_idx; while (seq < log_next_seq) { @@ -919,6 +919,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = log_next(idx); seq++; } + + /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; while (len > size && seq < log_next_seq) { @@ -929,7 +931,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) seq++; } - /* last message in this dump */ + /* last message fitting into this dump */ next_seq = log_next_seq; len = 0; @@ -2300,48 +2302,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * - * Iterate through each of the dump devices and call the oops/panic - * callbacks with the log buffer. + * Call each of the registered dumper's dump() callback, which can + * retrieve the kmsg records with kmsg_dump_get_line() or + * kmsg_dump_get_buffer(). */ void kmsg_dump(enum kmsg_dump_reason reason) { - u64 idx; struct kmsg_dumper *dumper; - const char *s1, *s2; - unsigned long l1, l2; unsigned long flags; if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) return; - /* Theoretically, the log could move on after we do this, but - there's not a lot we can do about that. The new messages - will overwrite the start of what we dump. */ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + if (dumper->max_reason && reason > dumper->max_reason) + continue; + + /* initialize iterator with data about the stored records */ + dumper->active = true; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason); + + /* reset iterator */ + dumper->active = false; + } + rcu_read_unlock(); +} + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + struct log *msg; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - if (syslog_seq < log_first_seq) - idx = syslog_idx; - else - idx = log_first_idx; + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } - if (idx > log_next_idx) { - s1 = log_buf; - l1 = log_next_idx; + /* last entry */ + if (dumper->cur_seq >= log_next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } - s2 = log_buf + idx; - l2 = log_buf_len - idx; - } else { - s1 = ""; - l1 = 0; + msg = log_from_idx(dumper->cur_idx); + l = msg_print_text(msg, syslog, + line, size); + + dumper->cur_idx = log_next(dumper->cur_idx); + dumper->cur_seq++; + ret = true; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + +/** + * kmsg_dump_get_buffer - copy kmsg log lines + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the end of the kmsg buffer and fill the provided buffer + * with as many of the the *youngest* kmsg records that fit into it. + * If the buffer is large enough, all available kmsg records will be + * copied with a single call. + * + * Consecutive calls will fill the buffer with the next block of + * available older records, not including the earlier retrieved ones. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + unsigned long flags; + u64 seq; + u32 idx; + u64 next_seq; + u32 next_idx; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } + + /* last entry */ + if (dumper->cur_seq >= dumper->next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } + + /* calculate length of entire buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + + /* move first record forward until length fits into the buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + while (l > size && seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); - s2 = log_buf + idx; - l2 = log_next_idx - idx; + l -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; } + + /* last message in next interation */ + next_seq = seq; + next_idx = idx; + + l = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, syslog, + buf + l, size - l); + + idx = log_next(idx); + seq++; + } + + dumper->next_seq = next_seq; + dumper->next_idx = next_idx; + ret = true; raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) - dumper->dump(dumper, reason, s1, l1, s2, l2); - rcu_read_unlock(); +/** + * kmsg_dump_rewind - reset the interator + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + */ +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); } +EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif -- cgit v1.2.3 From ea131377148cdfe90641b42ae9aa5a6b3a4fa327 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:22 +0200 Subject: uprobes: Valid_vma() should reject VM_HUGETLB __replace_page() obviously can't work with the hugetlbfs mappings, uprobe_register() will likely crash the kernel. Change valid_vma() to check VM_HUGETLB as well. As for PageTransHuge() no need to worry, vma->vm_file != NULL. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154322.GA9561@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b52376d02332..f0d04530af63 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -99,7 +99,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) if (!is_register) return true; - if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) + if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) + == (VM_READ|VM_EXEC)) return true; return false; -- cgit v1.2.3 From cc359d180fa9c25a4c1819f17e07a422d788353d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:25 +0200 Subject: uprobes: __copy_insn() should ensure a_ops->readpage != NULL __copy_insn() blindly calls read_mapping_page(), this will crash the kernel if ->readpage == NULL, add the necessary check. For example, hugetlbfs_aops->readpage is NULL. Perhaps we should change read_mapping_page() instead. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154325.GA9568@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f0d04530af63..604930bf9c92 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -610,6 +610,9 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins if (!filp) return -EINVAL; + if (!mapping->a_ops->readpage) + return -EIO; + idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); off1 = offset &= ~PAGE_MASK; -- cgit v1.2.3 From 5323ce71e4b4e1f188ebbc0cc7776885ea6c75fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:28 +0200 Subject: uprobes: Write_opcode()->__replace_page() can race with try_to_unmap() write_opcode() gets old_page via get_user_pages() and then calls __replace_page() which assumes that this old_page is still mapped after pte_offset_map_lock(). This is not true if this old_page was already try_to_unmap()'ed, and in this case everything __replace_page() does with old_page is wrong. Just for example, put_page() is not balanced. I think it is possible to teach __replace_page() to handle this unlikely case correctly, but this patch simply changes it to use page_check_address() and return -EAGAIN if it fails. The caller should notice this error code and retry. Note: write_opcode() asks for the cleanups, I'll try to do this in a separate patch. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154328.GA9571@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 604930bf9c92..3ccdb29ee8d6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -129,33 +129,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; unsigned long addr; - int err = -EFAULT; + spinlock_t *ptl; + pte_t *ptep; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) - goto out; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; + return -EFAULT; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; - - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + return -EAGAIN; get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -174,10 +158,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); - err = 0; -out: - return err; + return 0; } /** @@ -222,9 +204,10 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; + unsigned long pgoff; loff_t addr; int ret; - +retry: /* Read the page with vaddr into memory */ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) @@ -269,9 +252,9 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, memcpy(vaddr_new, vaddr_old, PAGE_SIZE); /* poke the new insn in, ASSUMES we don't cross page boundary */ - vaddr &= ~PAGE_MASK; - BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + pgoff = (vaddr & ~PAGE_MASK); + BUG_ON(pgoff + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + memcpy(vaddr_new + pgoff, &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -291,6 +274,8 @@ unlock_out: put_out: put_page(old_page); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } -- cgit v1.2.3 From c1914a0936f79ed0236f670122e06e36e4d332ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:31 +0200 Subject: uprobes: Install_breakpoint() should fail if is_swbp_insn() == T install_breakpoint() returns -EEXIST if is_swbp_insn(orig_insn) == T, the caller treats this code as success. This is doubly wrong. The successful return should set UPROBE_COPY_INSN, but the real problem is that it shouldn't succeed. If the probed insn is int3 the application should get SIGTRAP, this won't happen with uprobe. Probably we can fix this, we can add the UPROBE_SHARED_BP flag and teach handle_swbp/set_orig_insn to handle this case correctly. But this needs some complications and we have other insns which can't be probed, lets make a simple fix for now. I think this needs a cleanup. UPROBE_COPY_INSN should die, copy_insn() should be called by alloc_uprobe(). arch_uprobe_analyze_insn() depends on ->mm (ia32_compat) but it is called only once. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154331.GA9578@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3ccdb29ee8d6..ec78152e32e9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -693,7 +693,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -EEXIST; + return -ENOTSUPP; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) -- cgit v1.2.3 From 268720903f87e0b84b161626c4447b81671b5d18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:33 +0200 Subject: uprobes: Rework register_for_each_vma() to make it O(n) Currently register_for_each_vma() is O(n ** 2) + O(n ** 3), every time find_next_vma_info() "restarts" the vma_prio_tree_foreach() loop and each iteration rechecks the whole try_list. This also means that try_list can grow "indefinitely" if register/unregister races with munmap/mmap activity even if the number of mapping is bounded at any time. With this patch register_for_each_vma() builds the list of mm/vaddr structures only once and does install_breakpoint() for each entry. We do not care about the new mappings which can be created after build_map_info() drops mapping->i_mmap_mutex, uprobe_mmap() should do its work. Note that we do not allocate map_info under i_mmap_mutex, this can deadlock with page reclaim (but see the next patch). So we use 2 lists, "curr" which we are going to return, and "prev" which holds the already allocated memory. The main loop deques the entry from "prev" (initially it is empty), and if "prev" becomes empty again it counts the number of entries we need to pre-allocate outside of i_mmap_mutex. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120615154333.GA9581@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 199 +++++++++++++++++++++--------------------------- 1 file changed, 86 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ec78152e32e9..4e0db3496d70 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,17 +60,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); -/* - * Maintain a temporary per vma info that can be used to search if a vma - * has already been handled. This structure is introduced since extending - * vm_area_struct wasnt recommended. - */ -struct vma_info { - struct list_head probe_list; - struct mm_struct *mm; - loff_t vaddr; -}; - struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; @@ -742,139 +731,123 @@ static void delete_uprobe(struct uprobe *uprobe) atomic_dec(&uprobe_events); } -static struct vma_info * -__find_next_vma_info(struct address_space *mapping, struct list_head *head, - struct vma_info *vi, loff_t offset, bool is_register) +struct map_info { + struct map_info *next; + struct mm_struct *mm; + loff_t vaddr; +}; + +static inline struct map_info *free_map_info(struct map_info *info) { + struct map_info *next = info->next; + kfree(info); + return next; +} + +static struct map_info * +build_map_info(struct address_space *mapping, loff_t offset, bool is_register) +{ + unsigned long pgoff = offset >> PAGE_SHIFT; struct prio_tree_iter iter; struct vm_area_struct *vma; - struct vma_info *tmpvi; - unsigned long pgoff; - int existing_vma; - loff_t vaddr; - - pgoff = offset >> PAGE_SHIFT; + struct map_info *curr = NULL; + struct map_info *prev = NULL; + struct map_info *info; + int more = 0; + again: + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; - existing_vma = 0; - vaddr = vma_address(vma, offset); - - list_for_each_entry(tmpvi, head, probe_list) { - if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { - existing_vma = 1; - break; - } - } - - /* - * Another vma needs a probe to be installed. However skip - * installing the probe if the vma is about to be unlinked. - */ - if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { - vi->mm = vma->vm_mm; - vi->vaddr = vaddr; - list_add(&vi->probe_list, head); - - return vi; + if (!prev) { + more++; + continue; } - } - - return NULL; -} -/* - * Iterate in the rmap prio tree and find a vma where a probe has not - * yet been inserted. - */ -static struct vma_info * -find_next_vma_info(struct address_space *mapping, struct list_head *head, - loff_t offset, bool is_register) -{ - struct vma_info *vi, *retvi; + if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + continue; - vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); - if (!vi) - return ERR_PTR(-ENOMEM); + info = prev; + prev = prev->next; + info->next = curr; + curr = info; - mutex_lock(&mapping->i_mmap_mutex); - retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); + info->mm = vma->vm_mm; + info->vaddr = vma_address(vma, offset); + } mutex_unlock(&mapping->i_mmap_mutex); - if (!retvi) - kfree(vi); + if (!more) + goto out; + + prev = curr; + while (curr) { + mmput(curr->mm); + curr = curr->next; + } - return retvi; + do { + info = kmalloc(sizeof(struct map_info), GFP_KERNEL); + if (!info) { + curr = ERR_PTR(-ENOMEM); + goto out; + } + info->next = prev; + prev = info; + } while (--more); + + goto again; + out: + while (prev) + prev = free_map_info(prev); + return curr; } static int register_for_each_vma(struct uprobe *uprobe, bool is_register) { - struct list_head try_list; - struct vm_area_struct *vma; - struct address_space *mapping; - struct vma_info *vi, *tmpvi; - struct mm_struct *mm; - loff_t vaddr; - int ret; + struct map_info *info; + int err = 0; - mapping = uprobe->inode->i_mapping; - INIT_LIST_HEAD(&try_list); - - ret = 0; + info = build_map_info(uprobe->inode->i_mapping, + uprobe->offset, is_register); + if (IS_ERR(info)) + return PTR_ERR(info); - for (;;) { - vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); - if (!vi) - break; + while (info) { + struct mm_struct *mm = info->mm; + struct vm_area_struct *vma; + loff_t vaddr; - if (IS_ERR(vi)) { - ret = PTR_ERR(vi); - break; - } + if (err) + goto free; - mm = vi->mm; down_write(&mm->mmap_sem); - vma = find_vma(mm, (unsigned long)vi->vaddr); - if (!vma || !valid_vma(vma, is_register)) { - list_del(&vi->probe_list); - kfree(vi); - up_write(&mm->mmap_sem); - mmput(mm); - continue; - } + vma = find_vma(mm, (unsigned long)info->vaddr); + if (!vma || !valid_vma(vma, is_register)) + goto unlock; + vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != vi->vaddr) { - list_del(&vi->probe_list); - kfree(vi); - up_write(&mm->mmap_sem); - mmput(mm); - continue; - } + vaddr != info->vaddr) + goto unlock; - if (is_register) - ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); - else - remove_breakpoint(uprobe, mm, vi->vaddr); - - up_write(&mm->mmap_sem); - mmput(mm); if (is_register) { - if (ret && ret == -EEXIST) - ret = 0; - if (ret) - break; + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + if (err == -EEXIST) + err = 0; + } else { + remove_breakpoint(uprobe, mm, info->vaddr); } + unlock: + up_write(&mm->mmap_sem); + free: + mmput(mm); + info = free_map_info(info); } - list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { - list_del(&vi->probe_list); - kfree(vi); - } - - return ret; + return err; } static int __uprobe_register(struct uprobe *uprobe) -- cgit v1.2.3 From 7a5bfb66b07f22d2429db776da7bb8b57bfb5cff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:36 +0200 Subject: uprobes: Change build_map_info() to try kmalloc(GFP_NOWAIT) first build_map_info() doesn't allocate the memory under i_mmap_mutex to avoid the deadlock with page reclaim. But it can try GFP_NOWAIT first, it should work in the likely case and thus we almost never need the pre-alloc-and-retry path. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120615154336.GA9588@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e0db3496d70..897417dbca8e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -761,6 +761,16 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) if (!valid_vma(vma, is_register)) continue; + if (!prev && !more) { + /* + * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * reclaim. This is optimistic, no harm done if it fails. + */ + prev = kmalloc(sizeof(struct map_info), + GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (prev) + prev->next = NULL; + } if (!prev) { more++; continue; -- cgit v1.2.3 From c5784de2b351fe871bb57487878f7fc7ec5b075c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jun 2012 17:43:39 +0200 Subject: uprobes: Document uprobe_register() vs uprobe_mmap() race Because the mind is treacherous and makes us forget we need to write stuff down. Signed-off-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154339.GA9591@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 897417dbca8e..2671d9ad49be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -44,6 +44,23 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 +/* + * We need separate register/unregister and mmap/munmap lock hashes because + * of mmap_sem nesting. + * + * uprobe_register() needs to install probes on (potentially) all processes + * and thus needs to acquire multiple mmap_sems (consequtively, not + * concurrently), whereas uprobe_mmap() is called while holding mmap_sem + * for the particular process doing the mmap. + * + * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem + * because of lock order against i_mmap_mutex. This means there's a hole in + * the register vma iteration where a mmap() can happen. + * + * Thus uprobe_register() can race with uprobe_mmap() and we can try and + * install a probe where one is already installed. + */ + /* serialize (un)register */ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; @@ -339,7 +356,9 @@ out: int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { int result; - + /* + * See the comment near uprobes_hash(). + */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) return -EEXIST; @@ -845,6 +864,10 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { err = install_breakpoint(uprobe, mm, vma, info->vaddr); + /* + * We can race against uprobe_mmap(), see the + * comment near uprobe_hash(). + */ if (err == -EEXIST) err = 0; } else { @@ -1054,8 +1077,10 @@ int uprobe_mmap(struct vm_area_struct *vma) } ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - - /* Ignore double add: */ + /* + * We can race against uprobe_register(), see the + * comment near uprobe_hash(). + */ if (ret == -EEXIST) { ret = 0; -- cgit v1.2.3 From d436615e60c386095dac4a9bf72b08868d2a7564 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:42 +0200 Subject: uprobes: Copy_insn() shouldn't depend on mm/vma/vaddr 1. copy_insn() doesn't need "addr", it can use uprobe->offset. Remove this argument. 2. Change copy_insn/__copy_insn to accept "struct file*" instead of vma. copy_insn() is called only once and mm/vma/vaddr are random, it shouldn't depend on them. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154342.GA9598@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2671d9ad49be..08ef566da763 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -591,10 +591,9 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) } static int -__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, +__copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, unsigned long offset) { - struct file *filp = vma->vm_file; struct page *page; void *vaddr; unsigned long off1; @@ -625,15 +624,13 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins return 0; } -static int -copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping; unsigned long nbytes; int bytes; - addr &= ~PAGE_MASK; - nbytes = PAGE_SIZE - addr; + nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); mapping = uprobe->inode->i_mapping; /* Instruction at end of binary; copy only available bytes */ @@ -644,13 +641,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, + if (__copy_insn(mapping, filp, uprobe->arch.insn + nbytes, bytes - nbytes, uprobe->offset + nbytes)) return -ENOMEM; bytes = nbytes; } - return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); + return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } /* @@ -696,7 +693,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, addr = (unsigned long)vaddr; if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma, addr); + ret = copy_insn(uprobe, vma->vm_file); if (ret) return ret; -- cgit v1.2.3 From fc36f59565861af2e897225bc3782479a26c5d5a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:44 +0200 Subject: uprobes: Copy_insn() should not return -ENOMEM if __copy_insn() fails copy_insn() returns -ENOMEM if the first __copy_insn() fails, it should return the correct error code. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154344.GA9601@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 08ef566da763..2db1d94d7dfc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -641,10 +641,10 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, filp, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes)) - return -ENOMEM; - + int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes); + if (err) + return err; bytes = nbytes; } return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); -- cgit v1.2.3 From eb2bf57bee42c7565032f93adaa211e2c9fcc52c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:47 +0200 Subject: uprobes: No need to re-check vma_address() in write_opcode() write_opcode() is called by register_for_each_vma() and uprobe_mmap() paths. In both cases the caller has already verified this vaddr under mmap_sem, no need to re-check. Note also that this check is wrong anyway, we should not truncate loff_t returned by vma_address() if we do not trust this mapping. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154347.GA9604@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2db1d94d7dfc..14c71a2aadad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -211,7 +211,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; struct uprobe *uprobe; unsigned long pgoff; - loff_t addr; int ret; retry: /* Read the page with vaddr into memory */ @@ -235,10 +234,6 @@ retry: if (mapping != vma->vm_file->f_mapping) goto put_out; - addr = vma_address(vma, uprobe->offset); - if (vaddr != (unsigned long)addr) - goto put_out; - ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) -- cgit v1.2.3 From d9c4a30e82614d43b55893a73f31e7284007ce82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:50 +0200 Subject: uprobes: Move BUG_ON(UPROBE_SWBP_INSN_SIZE) from write_opcode() to install_breakpoint() write_opcode() ensures that UPROBE_SWBP_INSN doesn't cross the page boundary. This looks a bit confusing, the check does not depend on vaddr and it is enough to do it only once right after install_breakpoint()->arch_uprobe_analyze_insn(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154350.GA9611@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 14c71a2aadad..b9c61bda9029 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -210,7 +210,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; - unsigned long pgoff; int ret; retry: /* Read the page with vaddr into memory */ @@ -251,11 +250,7 @@ retry: vaddr_new = kmap_atomic(new_page); memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - - /* poke the new insn in, ASSUMES we don't cross page boundary */ - pgoff = (vaddr & ~PAGE_MASK); - BUG_ON(pgoff + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + pgoff, &opcode, UPROBE_SWBP_INSN_SIZE); + memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -699,6 +694,10 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (ret) return ret; + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + uprobe->flags |= UPROBE_COPY_INSN; } -- cgit v1.2.3 From 449d0d7c9fb87277175db34c009c70cb348004a8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:53 +0200 Subject: uprobes: Simplify the usage of uprobe->pending_list uprobe->pending_list is only used to create the temporary list, it has no meaning after we drop uprobes_mmap_hash(inode). No need to initialize this node or remove it from tmp_list, and we can use list_for_each_entry(). Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154353.GA9614@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b9c61bda9029..7d5c78f063ae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -513,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); - INIT_LIST_HEAD(&uprobe->pending_list); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -1037,7 +1036,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; int ret, count; @@ -1055,10 +1054,9 @@ int uprobe_mmap(struct vm_area_struct *vma) ret = 0; count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + list_for_each_entry(uprobe, &tmp_list, pending_list) { loff_t vaddr; - list_del(&uprobe->pending_list); if (!ret) { vaddr = vma_address(vma, uprobe->offset); @@ -1106,7 +1104,7 @@ int uprobe_mmap(struct vm_area_struct *vma) void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) @@ -1123,12 +1121,10 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, &tmp_list); - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + list_for_each_entry(uprobe, &tmp_list, pending_list) { loff_t vaddr; - list_del(&uprobe->pending_list); vaddr = vma_address(vma, uprobe->offset); - if (vaddr >= start && vaddr < end) { /* * An unregister could have removed the probe before -- cgit v1.2.3 From 816c03fbabe64fa09f66fbb64e932081af381415 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:55 +0200 Subject: uprobes: Don't use loff_t for the valid virtual address loff_t looks confusing when it is used for the virtual address. Change map_info and install_breakpoint/remove_breakpoint paths to use "unsigned long". The patch doesn't change vma_address(), it can't return "long" because it is used to verify the mapping. But probably this needs some cleanups too. Signed-off-by: Oleg Nesterov Signed-off-by: Anton Arapov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154355.GA9622@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7d5c78f063ae..4df84b76dd48 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -664,9 +664,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) */ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, loff_t vaddr) + struct vm_area_struct *vma, unsigned long vaddr) { - unsigned long addr; int ret; /* @@ -679,8 +678,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (!uprobe->consumers) return -EEXIST; - addr = (unsigned long)vaddr; - if (!(uprobe->flags & UPROBE_COPY_INSN)) { ret = copy_insn(uprobe, vma->vm_file); if (ret) @@ -689,7 +686,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -ENOTSUPP; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) return ret; @@ -709,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence increment before and decrement on failure. */ atomic_inc(&mm->uprobes_state.count); - ret = set_swbp(&uprobe->arch, mm, addr); + ret = set_swbp(&uprobe->arch, mm, vaddr); if (ret) atomic_dec(&mm->uprobes_state.count); @@ -717,9 +714,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, } static void -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) + if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) atomic_dec(&mm->uprobes_state.count); } @@ -743,7 +740,7 @@ static void delete_uprobe(struct uprobe *uprobe) struct map_info { struct map_info *next; struct mm_struct *mm; - loff_t vaddr; + unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) @@ -837,7 +834,6 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; - loff_t vaddr; if (err) goto free; @@ -847,9 +843,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (!vma || !valid_vma(vma, is_register)) goto unlock; - vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != info->vaddr) + vma_address(vma, uprobe->offset) != info->vaddr) goto unlock; if (is_register) { @@ -1055,10 +1050,8 @@ int uprobe_mmap(struct vm_area_struct *vma) count = 0; list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr; - if (!ret) { - vaddr = vma_address(vma, uprobe->offset); + loff_t vaddr = vma_address(vma, uprobe->offset); if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { put_uprobe(uprobe); @@ -1122,9 +1115,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon build_probe_list(inode, &tmp_list); list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr; + loff_t vaddr = vma_address(vma, uprobe->offset); - vaddr = vma_address(vma, uprobe->offset); if (vaddr >= start && vaddr < end) { /* * An unregister could have removed the probe before -- cgit v1.2.3 From 593609a59600c8377f311b300f14deacb155b9a4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:59 +0200 Subject: uprobes: __copy_insn() needs "loff_t offset" 1. __copy_insn() needs "loff_t offset", not "unsigned long", to read the file. 2. use pgoff_t for "idx" and remove the unnecessary typecast. 3. fix the typo, "&=" is not what we want 4. can't resist, rename off1 to off. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154359.GA9625@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4df84b76dd48..d1b2eeb80837 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -581,12 +581,12 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) static int __copy_insn(struct address_space *mapping, struct file *filp, char *insn, - unsigned long nbytes, unsigned long offset) + unsigned long nbytes, loff_t offset) { struct page *page; void *vaddr; - unsigned long off1; - unsigned long idx; + unsigned long off; + pgoff_t idx; if (!filp) return -EINVAL; @@ -594,8 +594,8 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, if (!mapping->a_ops->readpage) return -EIO; - idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); - off1 = offset &= ~PAGE_MASK; + idx = offset >> PAGE_CACHE_SHIFT; + off = offset & ~PAGE_MASK; /* * Ensure that the page that has the original instruction is @@ -606,7 +606,7 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, return PTR_ERR(page); vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off1, nbytes); + memcpy(insn, vaddr + off, nbytes); kunmap_atomic(vaddr); page_cache_release(page); -- cgit v1.2.3 From e227051b13956b8f71c0abecc41ad351e64671c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:44:01 +0200 Subject: uprobes: Remove the unnecessary initialization in add_utask() Trivial cleanup. No need to nullify ->active_uprobe after kzalloc(). Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154401.GA9633@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1b2eeb80837..f93532748bca 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1392,7 +1392,6 @@ static struct uprobe_task *add_utask(void) if (unlikely(!utask)) return NULL; - utask->active_uprobe = NULL; current->utask = utask; return utask; } -- cgit v1.2.3 From 4a77a5a06ec66ed05199b301e7c25f42f979afdc Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Sat, 16 Jun 2012 21:21:51 +0800 Subject: printk: use mutex lock to stop syslog_seq from going wild Although syslog_seq and log_next_seq stuff are protected by logbuf_lock spin log, it's not enough. Say we have two processes A and B, and let syslog_seq = N, while log_next_seq = N + 1, and the two processes both come to syslog_print at almost the same time. And No matter which process get the spin lock first, it will increase syslog_seq by one, then release spin lock; thus later, another process increase syslog_seq by one again. In this case, syslog_seq is bigger than syslog_next_seq. And latter, it would make: wait_event_interruptiable(log_wait, syslog != log_next_seq) don't wait any more even there is no new write comes. Thus it introduce a infinite loop reading. I can easily see this kind of issue by the following steps: # cat /proc/kmsg # at meantime, I don't kill rsyslog # So they are the two processes. # xinit # I added drm.debug=6 in the kernel parameter line, # so that it will produce lots of message and let that # issue happen It's 100% reproducable on my side. And my disk will be filled up by /var/log/messages in a quite short time. So, introduce a mutex_lock to stop syslog_seq from going wild just like what devkmsg_read() does. It does fix this issue as expected. v2: use mutex_lock_interruptiable() instead (comments from Kay) Signed-off-by: Yuanhan Liu Reviewed-by: Fengguang Wu Acked-By: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index ceb4a2f775a1..572730bd8a5c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -414,7 +414,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (!user) return -EBADF; - mutex_lock(&user->lock); + ret = mutex_lock_interruptible(&user->lock); + if (ret) + return ret; raw_spin_lock(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { @@ -976,6 +978,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; static int saved_console_loglevel = -1; + static DEFINE_MUTEX(syslog_mutex); int error; error = check_syslog_permissions(type, from_file); @@ -1002,11 +1005,17 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } + error = mutex_lock_interruptible(&syslog_mutex); + if (error) + goto out; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); - if (error) + if (error) { + mutex_unlock(&syslog_mutex); goto out; + } error = syslog_print(buf, len); + mutex_unlock(&syslog_mutex); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: -- cgit v1.2.3 From b56a39ac263e5b8cafedd551a49c2105e68b98c2 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Sat, 16 Jun 2012 12:40:55 +0800 Subject: printk: return -EINVAL if the message len is bigger than the buf size Just like what devkmsg_read() does, return -EINVAL if the message len is bigger than the buf size, or it will trigger a segfault error. Acked-by: Kay Sievers Acked-by: Fengguang Wu Signed-off-by: Yuanhan Liu Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 572730bd8a5c..a2276b916769 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -880,7 +880,9 @@ static int syslog_print(char __user *buf, int size) syslog_seq++; raw_spin_unlock_irq(&logbuf_lock); - if (len > 0 && copy_to_user(buf, text, len)) + if (len > size) + len = -EINVAL; + else if (len > 0 && copy_to_user(buf, text, len)) len = -EFAULT; kfree(text); -- cgit v1.2.3 From 9c5da09d266ca9b32eb16cf940f8161d949c2fe5 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 14 Jun 2012 15:31:09 -0700 Subject: perf: Use css_tryget() to avoid propping up css refcount An rmdir pushes css's ref count to zero. However, if the associated directory is open at the time, the dentry ref count is non-zero. If the fd for this directory is then passed into perf_event_open, it does a css_get(). This bounces the ref count back up from zero. This is a problem by itself. But what makes it turn into a crash is the fact that we end up doing an extra dput, since we perform a dput when css_put sees the ref count go down to zero. css_tryget() does not fall into that trap. So, we use that instead. Reproduction test-case for the bug: #include #include #include #include #include #include #include #include #include #define PERF_FLAG_PID_CGROUP (1U << 2) int perf_event_open(struct perf_event_attr *hw_event_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open,hw_event_uptr, pid, cpu, group_fd, flags); } /* * Directly poke at the perf_event bug, since it's proving hard to repro * depending on where in the kernel tree. what moved? */ int main(int argc, char **argv) { int fd; struct perf_event_attr attr; memset(&attr, 0, sizeof(attr)); attr.exclude_kernel = 1; attr.size = sizeof(attr); mkdir("/dev/cgroup/perf_event/blah", 0777); fd = open("/dev/cgroup/perf_event/blah", O_RDONLY); perror("open"); rmdir("/dev/cgroup/perf_event/blah"); sleep(2); perf_event_open(&attr, fd, 0, -1, PERF_FLAG_PID_CGROUP); perror("perf_event_open"); close(fd); return 0; } Signed-off-by: Salman Qazi Signed-off-by: Peter Zijlstra Acked-by: Tejun Heo Link: http://lkml.kernel.org/r/20120614223108.1025.2503.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f85c0154b333..d7d71d6ec972 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) return !event->cgrp || event->cgrp == cpuctx->cgrp; } -static inline void perf_get_cgroup(struct perf_event *event) +static inline bool perf_tryget_cgroup(struct perf_event *event) { - css_get(&event->cgrp->css); + return css_tryget(&event->cgrp->css); } static inline void perf_put_cgroup(struct perf_event *event) @@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, event->cgrp = cgrp; /* must be done before we fput() the file */ - perf_get_cgroup(event); + if (!perf_tryget_cgroup(event)) { + event->cgrp = NULL; + ret = -ENOENT; + goto out; + } /* * all events in a group must monitor -- cgit v1.2.3 From fbfc623f8231c8d8c78aab5841e9c6e5811ab638 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:31 +0800 Subject: perf: Avoid race between cpu hotplug and installing event perf_event_open() requires the cpu on which to install event is online, but the cpu can go offline after perf_event_open checks that. Add a get_online_cpus()/put_online_cpus() pair to avoid the race. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-3-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d7d71d6ec972..31d182e01549 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6252,6 +6252,8 @@ SYSCALL_DEFINE5(perf_event_open, } } + get_online_cpus(); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { @@ -6391,6 +6393,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); + put_online_cpus(); + event->owner = current; mutex_lock(¤t->perf_event_mutex); @@ -6419,6 +6423,7 @@ err_context: err_alloc: free_event(event); err_task: + put_online_cpus(); if (task) put_task_struct(task); err_group_fd: -- cgit v1.2.3 From e2d37cd213dcc0aeb3db4b37b9bd1710fe36fbf7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:32 +0800 Subject: perf: Allow the PMU driver to choose the CPU on which to install events Allow the pmu->event_init callback to change event->cpu, so the PMU driver can choose the CPU on which to install events. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-4-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 31d182e01549..fa36a39e8bb7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6306,7 +6306,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, cpu); + ctx = find_get_context(pmu, task, event->cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -6379,16 +6379,16 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); + perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, cpu); + perf_install_in_context(ctx, sibling, event->cpu); get_ctx(ctx); } } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); -- cgit v1.2.3 From 0cda4c023132aa93f2dd94811061f812e88daf4c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:33 +0800 Subject: perf: Introduce perf_pmu_migrate_context() Originally from Peter Zijlstra. The helper migrates perf events from one cpu to another cpu. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-5-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fa36a39e8bb7..f1cf0edeb39a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); event->ctx = ctx; + if (event->cpu != -1) + event->cpu = cpu; if (!task) { /* @@ -6379,6 +6381,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { + synchronize_rcu(); perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -6484,6 +6487,39 @@ err: } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx; + struct perf_event_context *dst_ctx; + struct perf_event *event, *tmp; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + + mutex_lock(&src_ctx->mutex); + list_for_each_entry_safe(event, tmp, &src_ctx->event_list, + event_entry) { + perf_remove_from_context(event); + put_ctx(src_ctx); + list_add(&event->event_entry, &events); + } + mutex_unlock(&src_ctx->mutex); + + synchronize_rcu(); + + mutex_lock(&dst_ctx->mutex); + list_for_each_entry_safe(event, tmp, &events, event_entry) { + list_del(&event->event_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + mutex_unlock(&dst_ctx->mutex); +} +EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); + static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { -- cgit v1.2.3 From 8e3bbf42c6d73881956863cc3305456afe2bc4ea Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 14 Jun 2012 14:55:30 -0700 Subject: cgroups: Account for CSS_DEACT_BIAS in __css_put When we fixed the race between atomic_dec and css_refcnt, we missed the fact that css_refcnt internally subtracts CSS_DEACT_BIAS to get the actual reference count. This can potentially cause a refcount leak if __css_put races with cgroup_clear_css_refs. Signed-off-by: Salman Qazi Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ceeafe874b3f..2097684cf194 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -255,12 +255,17 @@ int cgroup_lock_is_held(void) EXPORT_SYMBOL_GPL(cgroup_lock_is_held); +static int css_unbias_refcnt(int refcnt) +{ + return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; +} + /* the current nr of refs, always >= 0 whether @css is deactivated or not */ static int css_refcnt(struct cgroup_subsys_state *css) { int v = atomic_read(&css->refcnt); - return v >= 0 ? v : v - CSS_DEACT_BIAS; + return css_unbias_refcnt(v); } /* convenient tests for these bits */ @@ -4982,9 +4987,12 @@ EXPORT_SYMBOL_GPL(__css_tryget); void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int v; rcu_read_lock(); - switch (atomic_dec_return(&css->refcnt)) { + v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); + + switch (v) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.2.3 From 4fe7efdbdfb1c7e7a7f31decfd831c0f31d37091 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Jun 2012 12:53:01 -0700 Subject: mm: correctly synchronize rss-counters at exit/exec do_exit() and exec_mmap() call sync_mm_rss() before mm_release() does put_user(clear_child_tid) which can update task->rss_stat and thus make mm->rss_stat inconsistent. This triggers the "BUG:" printk in check_mm(). Let's fix this bug in the safest way, and optimize/cleanup this later. Reported-by: Markus Trippelsdorf Signed-off-by: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..c0277d3f1aaa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -643,6 +643,7 @@ static void exit_mm(struct task_struct * tsk) mm_release(tsk, mm); if (!mm) return; + sync_mm_rss(mm); /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state -- cgit v1.2.3 From 6347e90091041e34bea625370794c92f4ce71228 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 20 Jun 2012 12:53:03 -0700 Subject: pidns: guarantee that the pidns init will be the last pidns process reaped Today we have a twofold bug. Sometimes release_task on pid == 1 in a pid namespace can run before other processes in a pid namespace have had release task called. With the result that pid_ns_release_proc can be called before the last proc_flus_task() is done using upid->ns->proc_mnt, resulting in the use of a stale pointer. This same set of circumstances can lead to waitpid(...) returning for a processes started with clone(CLONE_NEWPID) before the every process in the pid namespace has actually exited. To fix this modify zap_pid_ns_processess wait until all other processes in the pid namespace have exited, even EXIT_DEAD zombies. The delay_group_leader and related tests ensure that the thread gruop leader will be the last thread of a process group to be reaped, or to become EXIT_DEAD and self reap. With the change to zap_pid_ns_processes we get the guarantee that pid == 1 in a pid namespace will be the last task that release_task is called on. With pid == 1 being the last task to pass through release_task pid_ns_release_proc can no longer be called too early nor can wait return before all of the EXIT_DEAD tasks in a pid namespace have exited. Signed-off-by: Eric W. Biederman Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: Mike Galbraith Acked-by: Pavel Emelyanov Tested-by: Andrew Wagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 +++++++++++++- kernel/pid_namespace.c | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c0277d3f1aaa..a85efd2348bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,7 +64,6 @@ static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; - detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -72,7 +71,20 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); + /* + * If we are the last child process in a pid namespace to be + * reaped, notify the reaper sleeping zap_pid_ns_processes(). + */ + if (IS_ENABLED(CONFIG_PID_NS)) { + struct task_struct *parent = p->real_parent; + + if ((task_active_pid_ns(p)->child_reaper == parent) && + list_empty(&parent->children) && + (parent->flags & PF_EXITING)) + wake_up_process(parent); + } } + detach_pid(p, PIDTYPE_PID); list_del_rcu(&p->thread_group); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 16b20e38c4a1..b3c7fd554250 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } read_unlock(&tasklist_lock); + /* Firstly reap the EXIT_ZOMBIE children we may have. */ do { clear_thread_flag(TIF_SIGPENDING); rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + /* + * sys_wait4() above can't reap the TASK_DEAD children. + * Make sure they all go away, see __unhash_process(). + */ + for (;;) { + bool need_wait = false; + + read_lock(&tasklist_lock); + if (!list_empty(¤t->children)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + need_wait = true; + } + read_unlock(&tasklist_lock); + + if (!need_wait) + break; + schedule(); + } + if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; -- cgit v1.2.3 From 50d75f8daead8a1f850c40a3b6c6575ab19b48cf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Jun 2012 12:53:04 -0700 Subject: pidns: find_new_reaper() can no longer switch to init_pid_ns.child_reaper find_new_reaper() changes pid_ns->child_reaper, see add0d4df ("pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing"). The original reason has gone away after the previous patch, ->children list must be empty after zap_pid_ns_processes(). However now we can not switch to init_pid_ns.child_reaper. __unhash_process() relies on the "->child_reaper == parent" check, but this check does not work if the last exiting task is also the child reaper. As Eric sugested, we can change __unhash_process() to use the parent's pid_ns and remove this code. Also, with this change we can move detach_pid(PIDTYPE_PID) back, where it was before the previous fix. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Cc: Louis Rilling Cc: Mike Galbraith Acked-by: Pavel Emelyanov Tested-by: Andrew Wagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a85efd2348bd..2f59cc334516 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; + detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -78,13 +79,12 @@ static void __unhash_process(struct task_struct *p, bool group_dead) if (IS_ENABLED(CONFIG_PID_NS)) { struct task_struct *parent = p->real_parent; - if ((task_active_pid_ns(p)->child_reaper == parent) && + if ((task_active_pid_ns(parent)->child_reaper == parent) && list_empty(&parent->children) && (parent->flags & PF_EXITING)) wake_up_process(parent); } } - detach_pid(p, PIDTYPE_PID); list_del_rcu(&p->thread_group); } @@ -732,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; } else if (father->signal->has_child_subreaper) { struct task_struct *reaper; -- cgit v1.2.3 From 5702c5eeab959e86ee2d9b4fe7f2d87e65b25d46 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 20 Jun 2012 12:53:04 -0700 Subject: c/r: prctl: Move PR_GET_TID_ADDRESS to a proper place During merging of PR_GET_TID_ADDRESS patch the code has been misplaced (it happened to appear under PR_MCE_KILL) in result noone can use this option. Fix it by moving code snippet to a proper place. Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f0ec44dcd415..e0c8ffc50d7f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2127,9 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; - case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); - break; default: return -EINVAL; } @@ -2147,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; error = 0; -- cgit v1.2.3 From 6648bd7e0e62c0c8c03b15e00c9e7015e232feff Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 21 Jun 2012 13:58:31 +0000 Subject: ipv4: Add sysctl knob to control early socket demux This change is meant to add a control for disabling early socket demux. The main motivation behind this patch is to provide an option to disable the feature as it adds an additional cost to routing that reduces overall throughput by up to 5%. For example one of my systems went from 12.1Mpps to 11.6 after the early socket demux was added. It looks like the reason for the regression is that we are now having to perform two lookups, first the one for an established socket, and then the one for the routing table. By adding this patch and toggling the value for ip_early_demux to 0 I am able to get back to the 12.1Mpps I was previously seeing. [ Move local variables in ip_rcv_finish() down into the basic block in which they are actually used. -DaveM ] Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- kernel/sysctl_binary.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694883a1..6a3cf8253aae 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -415,6 +415,8 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ + { CTL_INT, NET_IPV4_EARLY_DEMUX, "ip_early_demux" }, + { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, /* NET_TCP_DEFAULT_WIN_SCALE unused */ -- cgit v1.2.3 From dfbce08c19cba2ba4faaf8c0dd6d7678f46c78dd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 22 Jun 2012 23:02:22 -0700 Subject: ipv4: Don't add deprecated new binary sysctl value. Reported-by: Eric Dumazet Signed-off-by: David S. Miller --- kernel/sysctl_binary.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6a3cf8253aae..a650694883a1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -415,8 +415,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ - { CTL_INT, NET_IPV4_EARLY_DEMUX, "ip_early_demux" }, - { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, /* NET_TCP_DEFAULT_WIN_SCALE unused */ -- cgit v1.2.3 From 4661e3568a7d14a93d4e428d246cdb86f4bac6e7 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 22 Jun 2012 17:12:19 -0400 Subject: printk: fix regression in SYSLOG_ACTION_CLEAR Commit 7ff9554bb578ba02166071d2d487b7fc7d860d62 (printk: convert byte-buffer to variable-length record buffer) introduced a regression by accidentally removing a "break" statement from inside the big switch in printk's do_syslog(). The symptom of this bug is that the "dmesg -C" command doesn't only clear the kernel's log buffer; it also disables console logging. This patch (as1561) fixes the regression by adding the missing "break". Signed-off-by: Alan Stern CC: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a2276b916769..d6a1412f6b09 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1040,6 +1040,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_print_all(NULL, 0, true); + break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) -- cgit v1.2.3 From b41772abebc27c61dd578b76da99aa5240b4c99a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 20:50:42 -0700 Subject: rcu: Stop rcu_do_batch() from multiplexing the "count" variable Commit b1420f1c (Make rcu_barrier() less disruptive) rearranged the code in rcu_do_batch(), moving the ->qlen manipulation to follow the requeueing of the callbacks. Unfortunately, this rearrangement clobbered the value of the "count" local variable before the value of rdp->qlen was adjusted, resulting in the value of rdp->qlen being inaccurate. This commit therefore introduces an index variable "i", avoiding the inadvertent multiplexing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3b0f1337f75b..38ecdda3f55f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1530,7 +1530,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count, count_lazy; + int bl, count, count_lazy, i; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { @@ -1553,9 +1553,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; tail = rdp->nxttail[RCU_DONE_TAIL]; - for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) - if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[count] = &rdp->nxtlist; + for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = &rdp->nxtlist; local_irq_restore(flags); /* Invoke callbacks. */ @@ -1583,9 +1583,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; - for (count = 0; count < RCU_NEXT_SIZE; count++) - if (&rdp->nxtlist == rdp->nxttail[count]) - rdp->nxttail[count] = tail; + for (i = 0; i < RCU_NEXT_SIZE; i++) + if (&rdp->nxtlist == rdp->nxttail[i]) + rdp->nxttail[i] = tail; else break; } -- cgit v1.2.3 From 6fda135c908d0f38a0167adcbd71094572e3059b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 26 Jun 2012 12:35:24 -0700 Subject: Revert "printk: return -EINVAL if the message len is bigger than the buf size" This reverts commit b56a39ac263e5b8cafedd551a49c2105e68b98c2. A better patch from Jan will follow this to resolve the issue. Acked-by: Kay Sievers Cc: Fengguang Wu Cc: Yuanhan Liu Cc: Jan Beulich Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index d6a1412f6b09..ff05361962e1 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -880,9 +880,7 @@ static int syslog_print(char __user *buf, int size) syslog_seq++; raw_spin_unlock_irq(&logbuf_lock); - if (len > size) - len = -EINVAL; - else if (len > 0 && copy_to_user(buf, text, len)) + if (len > 0 && copy_to_user(buf, text, len)) len = -EFAULT; kfree(text); -- cgit v1.2.3 From 116e90b23f74d303e8d607c7a7d54f60f14ab9f2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 22 Jun 2012 16:36:09 +0100 Subject: syslog: fill buffer with more than a single message for SYSLOG_ACTION_READ The recent changes to the printk buffer management resulted in SYSLOG_ACTION_READ to only return a single message, whereas previously the buffer would get filled as much as possible. As, when too small to fit everything, filling it to the last byte would be pretty ugly with the new code, the patch arranges for as many messages as possible to get returned in a single invocation. User space tools in at least all SLES versions depend on the old behavior. This at once addresses the issue attempted to get fixed with commit b56a39ac263e5b8cafedd551a49c2105e68b98c2 ("printk: return -EINVAL if the message len is bigger than the buf size"), and since that commit widened the possibility for losing a message altogether, the patch here assumes that this other commit would get reverted first (otherwise the patch here won't apply). Furthermore, this patch also addresses the problem dealt with in commit 4a77a5a06ec66ed05199b301e7c25f42f979afdc ("printk: use mutex lock to stop syslog_seq from going wild"), so I'd recommend reverting that one too (albeit there's no direct collision between the two). Signed-off-by: Jan Beulich Acked-by: Kay Sievers Cc: Yuanhan Liu Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 51 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index ff05361962e1..cdfba44fedf0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -862,26 +862,49 @@ static int syslog_print(char __user *buf, int size) { char *text; struct log *msg; - int len; + int len = 0; text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - } - msg = log_from_idx(syslog_idx); - len = msg_print_text(msg, true, text, LOG_LINE_MAX); - syslog_idx = log_next(syslog_idx); - syslog_seq++; - raw_spin_unlock_irq(&logbuf_lock); + while (size > 0) { + size_t n; + + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + } + if (syslog_seq == log_next_seq) { + raw_spin_unlock_irq(&logbuf_lock); + break; + } + msg = log_from_idx(syslog_idx); + n = msg_print_text(msg, true, text, LOG_LINE_MAX); + if (n <= size) { + syslog_idx = log_next(syslog_idx); + syslog_seq++; + } else + n = 0; + raw_spin_unlock_irq(&logbuf_lock); + + if (!n) + break; - if (len > 0 && copy_to_user(buf, text, len)) - len = -EFAULT; + len += n; + size -= n; + buf += n; + n = copy_to_user(buf - n, text, n); + + if (n) { + len -= n; + if (!len) + len = -EFAULT; + break; + } + } kfree(text); return len; -- cgit v1.2.3 From c64e66c67b574f25a048886807c2007d17d50d0a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 26 Jun 2012 21:45:21 -0700 Subject: audit: netlink: Move away from NLMSG_NEW(). And use nlmsg_data() while we're here too. Signed-off-by: David S. Miller --- kernel/audit.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 1c7f2c61416b..30b252a1fb61 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb) static void audit_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - char *data = NLMSG_DATA(nlh); + char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE) { if (printk_ratelimit()) @@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, if (!skb) return NULL; - nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); - data = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, pid, seq, t, size, flags); + if (!nlh) + goto out_kfree_skb; + data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; -nlmsg_failure: /* Used by NLMSG_NEW */ - if (skb) - kfree_skb(skb); +out_kfree_skb: + kfree_skb(skb); return NULL; } @@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; - data = NLMSG_DATA(nlh); + data = nlmsg_data(nlh); switch (msg_type) { case AUDIT_GET: @@ -1060,13 +1061,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) - goto nlmsg_failure; + goto err; - nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); + nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); + if (!nlh) + goto out_kfree_skb; return ab; -nlmsg_failure: /* Used by NLMSG_NEW */ +out_kfree_skb: kfree_skb(ab->skb); ab->skb = NULL; err: -- cgit v1.2.3 From 0be61ebc18b919dddbdbcd1c4f42513c310ecf59 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 18 Jun 2012 09:28:16 -0400 Subject: tracing/selftest: Add a WARN_ON() if a tracer test fails Add a WARN_ON() output on test failures so that they are easier to detect in automated tests. Although, the WARN_ON() will not print if the test causes the system to crash, obviously. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49249c28690d..748f6401edf6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -830,6 +830,8 @@ int register_tracer(struct tracer *type) current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ -- cgit v1.2.3 From 6d158a813efcd09661c23f16ddf7e2ff834cb20c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Jun 2012 20:46:14 -0400 Subject: tracing: Remove NR_CPUS array from trace_iterator Replace the NR_CPUS array of buffer_iter from the trace_iterator with an allocated array. This will just create an array of possible CPUS instead of the max number specified. The use of NR_CPUS in that array caused allocation failures for machines that were tight on memory. This did not cause any failures to the system itself (no crashes), but caused unnecessary failures for reading the trace files. Added a helper function called 'trace_buffer_iter()' that returns the buffer_iter item or NULL if it is not defined or the array was not allocated. Some routines do not require the array (tracing_open_pipe() for one). Reported-by: Dave Jones Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 27 ++++++++++++++++++--------- kernel/trace/trace.h | 8 ++++++++ kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 27 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 748f6401edf6..b2af14e94c28 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1710,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); + iter->idx++; - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + if (buf_iter) + ring_buffer_read(buf_iter, NULL); } static struct trace_entry * @@ -1720,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); @@ -1858,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) tr->data[cpu]->skipped_entries = 0; - if (!iter->buffer_iter[cpu]) + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) return; - buf_iter = iter->buffer_iter[cpu]; ring_buffer_iter_reset(buf_iter); /* @@ -2207,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) int trace_empty(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { cpu = iter->cpu_file; - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2223,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter) } for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2383,6 +2388,8 @@ __tracing_open(struct inode *inode, struct file *file) if (!iter) return ERR_PTR(-ENOMEM); + iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + GFP_KERNEL); /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. @@ -2443,6 +2450,7 @@ __tracing_open(struct inode *inode, struct file *file) fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } @@ -2483,6 +2491,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5aec220d2de0..55e1f7f0db12 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -317,6 +317,14 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + if (iter->buffer_iter && iter->buffer_iter[cpu]) + return iter->buffer_iter[cpu]; + return NULL; +} + int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c653d8..ce27c8ba8d31 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter, next = &data->ret; } else { - ring_iter = iter->buffer_iter[iter->cpu]; + ring_iter = trace_buffer_iter(iter, iter->cpu); /* First peek to compare current entry and the next one */ if (ring_iter) -- cgit v1.2.3 From a5fb833172eca69136e9ee1ada778e404086ab8a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Jun 2012 13:35:04 -0400 Subject: ring-buffer: Fix uninitialized read_stamp The ring buffer reader page is used to swap a page from the writable ring buffer. If the writer happens to be on that page, it ends up on the reader page, but will simply move off of it, back into the writable ring buffer as writes are added. The time stamp passed back to the readers is stored in the cpu_buffer per CPU descriptor. This stamp is updated when a swap of the reader page takes place, and it reads the current stamp from the page taken from the writable ring buffer. Everytime a writer goes to a new page, it updates the time stamp of that page. The problem happens if a reader reads a page from an empty per CPU ring buffer. If the buffer is empty, the swap still takes place, placing the writer at the start of the reader page. If at a later time, a write happens, it updates the page's time stamp and continues. But the problem is that the read_stamp does not get updated, because the page was already swapped. The solution to this was to not swap the page if the ring buffer happens to be empty. This also removes the side effect that the writes on the reader page will not get updated because the writer never gets back on the reader page without a swap. That is, if a read happens on an empty buffer, but then no reads happen for a while. If a swap took place, and the writer were to start writing a lot of data (function tracer), it will start overflowing the ring buffer and overwrite the older data. But because the writer never goes back onto the reader page, the data left on the reader page never gets overwritten. This causes the reader to see really old data, followed by a jump to newer data. Link: http://lkml.kernel.org/r/1340060577-9112-1-git-send-email-dhsharp@google.com Google-Bug-Id: 6410455 Reported-by: David Sharp tested-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..82a3e0c56b1d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + /* * Reset the reader page to size zero. */ -- cgit v1.2.3 From 084681d14e429cb6192262ac7437f00e2c02f26a Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 28 Jun 2012 09:38:53 +0200 Subject: printk: flush continuation lines immediately to console Continuation lines are buffered internally, intended to merge the chunked printk()s into a single record, and to isolate potentially racy continuation users from usual terminated line users. This though, has the effect that partial lines are not printed to the console in the moment they are emitted. In case the kernel crashes in the meantime, the potentially interesting printed information would never reach the consoles. Here we share the continuation buffer with the console copy logic, and partial lines are always immediately flushed to the available consoles. They are still buffered internally to improve the readability and integrity of the messages and minimize the amount of needed record headers to store. Signed-off-by: Kay Sievers Tested-by: Steven Rostedt Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 244 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 176 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index cdfba44fedf0..fbf4d0b22a1d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -193,12 +193,19 @@ static int console_may_schedule; * separated by ',', and find the message after the ';' character. */ +enum log_flags { + LOG_DEFAULT = 0, + LOG_NOCONS = 1, /* already flushed, do not print to console */ +}; + struct log { u64 ts_nsec; /* timestamp in nanoseconds */ u16 len; /* length of entire record */ u16 text_len; /* length of text buffer */ u16 dict_len; /* length of dictionary buffer */ - u16 level; /* syslog level + facility */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ }; /* @@ -286,6 +293,7 @@ static u32 log_next(u32 idx) /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) { @@ -329,8 +337,13 @@ static void log_store(int facility, int level, msg->text_len = text_len; memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; - msg->level = (facility << 3) | (level & 7); - msg->ts_nsec = local_clock(); + msg->facility = facility; + msg->level = level & 7; + msg->flags = flags & 0x1f; + if (ts_nsec > 0) + msg->ts_nsec = ts_nsec; + else + msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = sizeof(struct log) + text_len + dict_len + pad_len; @@ -446,7 +459,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); len = sprintf(user->buf, "%u,%llu,%llu;", - msg->level, user->seq, ts_usec); + (msg->facility << 3) | msg->level, user->seq, ts_usec); /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { @@ -787,6 +800,21 @@ static bool printk_time; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static size_t print_time(u64 ts, char *buf) +{ + unsigned long rem_nsec; + + if (!printk_time) + return 0; + + if (!buf) + return 15; + + rem_nsec = do_div(ts, 1000000000); + return sprintf(buf, "[%5lu.%06lu] ", + (unsigned long)ts, rem_nsec / 1000); +} + static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; @@ -803,18 +831,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) } } - if (printk_time) { - if (buf) { - unsigned long long ts = msg->ts_nsec; - unsigned long rem_nsec = do_div(ts, 1000000000); - - len += sprintf(buf + len, "[%5lu.%06lu] ", - (unsigned long) ts, rem_nsec / 1000); - } else { - len += 15; - } - } - + len += print_time(msg->ts_nsec, buf ? buf + len : NULL); return len; } @@ -1294,15 +1311,92 @@ static inline void printk_delay(void) } } +/* + * Continuation lines are buffered, and not committed to the record buffer + * until the line is complete, or a race forces it. The line fragments + * though, are printed immediately to the consoles to ensure everything has + * reached the console in case of a kernel crash. + */ +static struct cont { + char buf[LOG_LINE_MAX]; + size_t len; /* length == 0 means unused buffer */ + size_t cons; /* bytes written to console */ + struct task_struct *owner; /* task of first print*/ + u64 ts_nsec; /* time of first print */ + u8 level; /* log level of first message */ + u8 facility; /* log level of first message */ + bool flushed:1; /* buffer sealed and committed */ +} cont; + +static void cont_flush(void) +{ + if (cont.flushed) + return; + if (cont.len == 0) + return; + + log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, + NULL, 0, cont.buf, cont.len); + + cont.flushed = true; +} + +static bool cont_add(int facility, int level, const char *text, size_t len) +{ + if (cont.len && cont.flushed) + return false; + + if (cont.len + len > sizeof(cont.buf)) { + cont_flush(); + return false; + } + + if (!cont.len) { + cont.facility = facility; + cont.level = level; + cont.owner = current; + cont.ts_nsec = local_clock(); + cont.cons = 0; + cont.flushed = false; + } + + memcpy(cont.buf + cont.len, text, len); + cont.len += len; + return true; +} + +static size_t cont_print_text(char *text, size_t size) +{ + size_t textlen = 0; + size_t len; + + if (cont.cons == 0) { + textlen += print_time(cont.ts_nsec, text); + size -= textlen; + } + + len = cont.len - cont.cons; + if (len > 0) { + if (len+1 > size) + len = size-1; + memcpy(text + textlen, cont.buf + cont.cons, len); + textlen += len; + cont.cons = cont.len; + } + + if (cont.flushed) { + text[textlen++] = '\n'; + /* got everything, release buffer */ + cont.len = 0; + } + return textlen; +} + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { static int recursion_bug; - static char cont_buf[LOG_LINE_MAX]; - static size_t cont_len; - static int cont_level; - static struct task_struct *cont_task; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; @@ -1348,7 +1442,8 @@ asmlinkage int vprintk_emit(int facility, int level, recursion_bug = 0; printed_len += strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, NULL, 0, recursion_msg, printed_len); + log_store(0, 2, LOG_DEFAULT, 0, + NULL, 0, recursion_msg, printed_len); } /* @@ -1386,55 +1481,38 @@ asmlinkage int vprintk_emit(int facility, int level, } if (!newline) { - if (cont_len && (prefix || cont_task != current)) { - /* - * Flush earlier buffer, which is either from a - * different thread, or when we got a new prefix. - */ - log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); - cont_len = 0; - } - - if (!cont_len) { - cont_level = level; - cont_task = current; - } + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or another task also prints continuation lines. + */ + if (cont.len && (prefix || cont.owner != current)) + cont_flush(); - /* buffer or append to earlier buffer from the same thread */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; + /* buffer line if possible, otherwise store it right away */ + if (!cont_add(facility, level, text, text_len)) + log_store(facility, level, LOG_DEFAULT, 0, + dict, dictlen, text, text_len); } else { - if (cont_len && cont_task == current) { - if (prefix) { - /* - * New prefix from the same thread; flush. We - * either got no earlier newline, or we race - * with an interrupt. - */ - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - } + bool stored = false; - /* append to the earlier buffer and flush */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - cont_task = NULL; - printed_len = cont_len; - } else { - /* ordinary single and terminated line */ - log_store(facility, level, - dict, dictlen, text, text_len); - printed_len = text_len; + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or we race with a continuation line from an interrupt. + */ + if (cont.len && prefix && cont.owner == current) + cont_flush(); + + /* Merge with our buffer if possible; flush it in any case */ + if (cont.len && cont.owner == current) { + stored = cont_add(facility, level, text, text_len); + cont_flush(); } + + if (!stored) + log_store(facility, level, LOG_DEFAULT, 0, + dict, dictlen, text, text_len); } + printed_len += text_len; /* * Try to acquire and then immediately release the console semaphore. @@ -1521,11 +1599,18 @@ EXPORT_SYMBOL(printk); #else #define LOG_LINE_MAX 0 +static struct cont { + size_t len; + size_t cons; + u8 level; + bool flushed:1; +} cont; static struct log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} static size_t msg_print_text(const struct log *msg, bool syslog, char *buf, size_t size) { return 0; } +static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ @@ -1817,6 +1902,7 @@ static u32 console_idx; */ void console_unlock(void) { + static char text[LOG_LINE_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; @@ -1829,10 +1915,23 @@ void console_unlock(void) console_may_schedule = 0; + /* flush buffered message fragment immediately to console */ + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (cont.len && (cont.cons < cont.len || cont.flushed)) { + size_t len; + + len = cont_print_text(text, sizeof(text)); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + } else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + again: for (;;) { struct log *msg; - static char text[LOG_LINE_MAX]; size_t len; int level; @@ -1847,13 +1946,22 @@ again: console_seq = log_first_seq; console_idx = log_first_idx; } - +skip: if (console_seq == log_next_seq) break; msg = log_from_idx(console_idx); - level = msg->level & 7; + if (msg->flags & LOG_NOCONS) { + /* + * Skip record we have buffered and already printed + * directly to the console when we received it. + */ + console_idx = log_next(console_idx); + console_seq++; + goto skip; + } + level = msg->level; len = msg_print_text(msg, false, text, sizeof(text)); console_idx = log_next(console_idx); -- cgit v1.2.3 From 44b99462d9d776522e174d6c531ce5ccef309e26 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 22 Jun 2012 11:50:05 -0700 Subject: ring-buffer: Fix crash due to uninitialized new_pages list head The new_pages list head in the cpu_buffer is not initialized. When adding pages to the ring buffer, if the memory allocation fails in ring_buffer_resize, the clean up handler tries to free up the allocated pages from all the cpu buffers. The panic is caused by referencing the uninitialized new_pages list head. Initializing the new_pages list head in rb_allocate_cpu_buffer fixes this. Link: http://lkml.kernel.org/r/1340391005-10880-1-git-send-email-vnagarnaik@google.com Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..ba39cbabdc9f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) -- cgit v1.2.3 From 48fdc72f23ad9a9956e524a47843135d0bbc3317 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 29 Jun 2012 12:31:41 -0700 Subject: ring-buffer: Fix accounting of entries when removing pages When removing pages from the ring buffer, its state is not reset. This means that the counters need to be correctly updated to account for the pages removed. Update the overrun counter to reflect the removed events from the pages. Link: http://lkml.kernel.org/r/1340998301-1715-1-git-send-email-vnagarnaik@google.com Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ba39cbabdc9f..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1347,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. - * No need to update overruns, since this page is - * deleted from ring buffer and its entries are - * already accounted for. + * Increment overrun to account for the lost events. */ + local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); } -- cgit v1.2.3 From d36208227d03c44c0a74cd702cc94528162e1703 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Jun 2012 11:40:11 -0400 Subject: printk: Optimize if statement logic where newline exists In reviewing Kay's fix up patch: "printk: Have printk() never buffer its data", I found two if statements that could be combined and optimized. Put together the two 'cont.len && cont.owner == current' if statements into a single one, and check if we need to call cont_add(). This also removes the unneeded double cont_flush() calls. Link: http://lkml.kernel.org/r/1340869133.876.10.camel@mop Signed-off-by: Steven Rostedt Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index fbf4d0b22a1d..5ae6b09e3805 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1496,15 +1496,14 @@ asmlinkage int vprintk_emit(int facility, int level, bool stored = false; /* - * Flush the conflicting buffer. An earlier newline was missing, - * or we race with a continuation line from an interrupt. + * If an earlier newline was missing and it was the same task, + * either merge it with the current buffer and flush, or if + * there was a race with interrupts (prefix == true) then just + * flush it out and store this line separately. */ - if (cont.len && prefix && cont.owner == current) - cont_flush(); - - /* Merge with our buffer if possible; flush it in any case */ if (cont.len && cont.owner == current) { - stored = cont_add(facility, level, text, text_len); + if (!prefix) + stored = cont_add(facility, level, text, text_len); cont_flush(); } -- cgit v1.2.3 From a31f2d17b331db970259e875b7223d3aba7e3821 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 Jun 2012 06:15:21 +0000 Subject: netlink: add netlink_kernel_cfg parameter to netlink_kernel_create This patch adds the following structure: struct netlink_kernel_cfg { unsigned int groups; void (*input)(struct sk_buff *skb); struct mutex *cb_mutex; }; That can be passed to netlink_kernel_create to set optional configurations for netlink kernel sockets. I've populated this structure by looking for NULL and zero parameters at the existing code. The remaining parameters that always need to be set are still left in the original interface. That includes optional parameters for the netlink socket creation. This allows easy extensibility of this interface in the future. This patch also adapts all callers to use this new interface. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- kernel/audit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 30b252a1fb61..4a3f28d2ca65 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -962,14 +962,17 @@ static void audit_receive(struct sk_buff *skb) static int __init audit_init(void) { int i; + struct netlink_kernel_cfg cfg = { + .input = audit_receive, + }; if (audit_initialized == AUDIT_DISABLED) return 0; printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, + THIS_MODULE, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else -- cgit v1.2.3 From 4f0f4af59cb07bcf44d3c07a9e8c26df54d9fff8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Jun 2012 15:37:24 -0700 Subject: printk.c: fix kernel-doc warnings Fix kernel-doc warnings in printk.c: use correct parameter name. Warning(kernel/printk.c:2429): No description found for parameter 'buf' Warning(kernel/printk.c:2429): Excess function parameter 'line' description in 'kmsg_dump_get_buffer' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 5ae6b09e3805..dba18211685e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -2538,7 +2538,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * kmsg_dump_get_buffer - copy kmsg log lines * @dumper: registered kmsg dumper * @syslog: include the "<4>" prefixes - * @line: buffer to copy the line to + * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * -- cgit v1.2.3 From 62c552ccc3eda1198632a4f344aa32623d226bab Mon Sep 17 00:00:00 2001 From: Bojan Smojver Date: Sat, 16 Jun 2012 00:09:58 +0200 Subject: PM / Hibernate: Enable suspend to both for in-kernel hibernation. It is often useful to suspend to memory after hibernation image has been written to disk. If the battery runs out or power is otherwise lost, the computer will resume from the hibernated image. If not, it will resume from memory and hibernation image will be discarded. Signed-off-by: Bojan Smojver Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/power/power.h | 3 +++ kernel/power/swap.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8b53db38a279..21ad3fe3164f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -5,6 +5,7 @@ * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2004 Pavel Machek * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. + * Copyright (C) 2012 Bojan Smojver * * This file is released under the GPLv2. */ @@ -46,6 +47,9 @@ enum { HIBERNATION_PLATFORM, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, +#ifdef CONFIG_SUSPEND + HIBERNATION_SUSPEND, +#endif /* keep last */ __HIBERNATION_AFTER_LAST }; @@ -574,6 +578,10 @@ int hibernation_platform_enter(void) */ static void power_down(void) { +#ifdef CONFIG_SUSPEND + int error; +#endif + switch (hibernation_mode) { case HIBERNATION_REBOOT: kernel_restart(NULL); @@ -583,6 +591,25 @@ static void power_down(void) case HIBERNATION_SHUTDOWN: kernel_power_off(); break; +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + if (error) { + if (hibernation_ops) + hibernation_mode = HIBERNATION_PLATFORM; + else + hibernation_mode = HIBERNATION_SHUTDOWN; + power_down(); + } + /* + * Restore swap signature. + */ + error = swsusp_unmark(); + if (error) + printk(KERN_ERR "PM: Swap will be unusable! " + "Try swapon -a.\n"); + return; +#endif } kernel_halt(); /* @@ -827,6 +854,9 @@ static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", +#ifdef CONFIG_SUSPEND + [HIBERNATION_SUSPEND] = "suspend", +#endif }; /* @@ -867,6 +897,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -907,6 +940,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif hibernation_mode = mode; break; case HIBERNATION_PLATFORM: diff --git a/kernel/power/power.h b/kernel/power/power.h index b0bd4beaebfe..7d4b7ffb3c1d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -156,6 +156,9 @@ extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(fmode_t); +#ifdef CONFIG_SUSPEND +extern int swsusp_unmark(void); +#endif /* kernel/power/block_io.c */ extern struct block_device *hib_resume_bdev; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11e22c068e8b..83d505142b00 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode) blkdev_put(hib_resume_bdev, mode); } +/** + * swsusp_unmark - Unmark swsusp signature in the resume device + */ + +#ifdef CONFIG_SUSPEND +int swsusp_unmark(void) +{ + int error; + + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); + } else { + printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + error = -ENODEV; + } + + /* + * We just returned from suspend, we don't need the image any more. + */ + free_all_swap_pages(root_swap); + + return error; +} +#endif + static int swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); -- cgit v1.2.3 From 443772d408a25af62498793f6f805ce3c559309a Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 16 Jun 2012 15:30:45 +0200 Subject: ftrace: Disable function tracing during suspend/resume and hibernation, again If function tracing is enabled for some of the low-level suspend/resume functions, it leads to triple fault during resume from suspend, ultimately ending up in a reboot instead of a resume (or a total refusal to come out of suspended state, on some machines). This issue was explained in more detail in commit f42ac38c59e0a03d (ftrace: disable tracing for suspend to ram). However, the changes made by that commit got reverted by commit cbe2f5a6e84eebb (tracing: allow tracing of suspend/resume & hibernation code again). So, unfortunately since things are not yet robust enough to allow tracing of low-level suspend/resume functions, suspend/resume is still broken when ftrace is enabled. So fix this by disabling function tracing during suspend/resume & hibernation. Signed-off-by: Srivatsa S. Bhat Cc: stable@vger.kernel.org Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 ++++++ kernel/power/suspend.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 21ad3fe3164f..0d4b354bc1be 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -358,6 +358,7 @@ int hibernation_snapshot(int platform_mode) } suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -383,6 +384,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); + ftrace_start(); resume_console(); dpm_complete(msg); @@ -485,6 +487,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -492,6 +495,7 @@ int hibernation_restore(int platform_mode) dpm_resume_end(PMSG_RECOVER); } pm_restore_gfp_mask(); + ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -518,6 +522,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -561,6 +566,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + ftrace_start(); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 396d262b8fd0..c8b7446b27df 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "power.h" @@ -212,6 +213,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_stop(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -231,6 +233,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + ftrace_start(); resume_console(); Close: if (suspend_ops->end) -- cgit v1.2.3 From 4b7760ba0dd3319f66886ab2335a0fbecdbc808a Mon Sep 17 00:00:00 2001 From: Sameer Nanda Date: Tue, 19 Jun 2012 22:23:33 +0200 Subject: PM / Sleep: add knob for printing device resume times Added a new knob called /sys/power/pm_print_times. Setting it to 1 enables printing of time taken by devices to suspend and resume. Setting it to 0 disables this printing (unless overridden by initcall_debug kernel command line option). Signed-off-by: Sameer Nanda Acked-by: Greg KH Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 428f8a034e96..7beb3fb3670b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -132,6 +132,38 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, } power_attr(pm_test); + +/* + * pm_print_times: print time taken by devices to suspend and resume. + * + * show() returns whether printing of suspend and resume times is enabled. + * store() accepts 0 or 1. 0 disables printing and 1 enables it. + */ +int pm_print_times_enabled; + +static ssize_t pm_print_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_print_times_enabled); +} + +static ssize_t pm_print_times_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_print_times_enabled = val; + return n; +} + +power_attr(pm_print_times); #endif /* CONFIG_PM_DEBUG */ #ifdef CONFIG_DEBUG_FS @@ -530,6 +562,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, + &pm_print_times_attr.attr, #endif #endif NULL, -- cgit v1.2.3 From b2df1d4f8b95d9d1e3f064cef02fc5c5116b05cf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Jun 2012 00:19:33 +0200 Subject: PM / Sleep: Separate printing suspend times from initcall_debug Change the behavior of the newly introduced /sys/power/pm_print_times attribute so that its initial value depends on initcall_debug, but setting it to 0 will cause device suspend/resume times not to be printed, even if initcall_debug has been set. This way, the people who use initcall_debug for reasons other than PM debugging will be able to switch the suspend/resume times printing off, if need be. Signed-off-by: Rafael J. Wysocki Reviewed-by: Srivatsa S. Bhat Acked-by: Greg Kroah-Hartman --- kernel/power/Kconfig | 4 +-- kernel/power/main.c | 76 ++++++++++++++++++++++++++++++---------------------- 2 files changed, 46 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 8f9b4eb974e0..a70518c9d82f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -175,7 +175,7 @@ config PM_TEST_SUSPEND You probably want to have your system's RTC driver statically linked, ensuring that it's available when this test runs. -config CAN_PM_TRACE +config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP @@ -196,7 +196,7 @@ config PM_TRACE config PM_TRACE_RTC bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE + depends on PM_SLEEP_DEBUG depends on X86 select PM_TRACE ---help--- diff --git a/kernel/power/main.c b/kernel/power/main.c index 7beb3fb3670b..f458238109cc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -132,38 +132,6 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, } power_attr(pm_test); - -/* - * pm_print_times: print time taken by devices to suspend and resume. - * - * show() returns whether printing of suspend and resume times is enabled. - * store() accepts 0 or 1. 0 disables printing and 1 enables it. - */ -int pm_print_times_enabled; - -static ssize_t pm_print_times_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", pm_print_times_enabled); -} - -static ssize_t pm_print_times_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned long val; - - if (kstrtoul(buf, 10, &val)) - return -EINVAL; - - if (val > 1) - return -EINVAL; - - pm_print_times_enabled = val; - return n; -} - -power_attr(pm_print_times); #endif /* CONFIG_PM_DEBUG */ #ifdef CONFIG_DEBUG_FS @@ -267,6 +235,47 @@ late_initcall(pm_debugfs_init); #endif /* CONFIG_PM_SLEEP */ +#ifdef CONFIG_PM_SLEEP_DEBUG +/* + * pm_print_times: print time taken by devices to suspend and resume. + * + * show() returns whether printing of suspend and resume times is enabled. + * store() accepts 0 or 1. 0 disables printing and 1 enables it. + */ +bool pm_print_times_enabled; + +static ssize_t pm_print_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_print_times_enabled); +} + +static ssize_t pm_print_times_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_print_times_enabled = !!val; + return n; +} + +power_attr(pm_print_times); + +static inline void pm_print_times_init(void) +{ + pm_print_times_enabled = !!initcall_debug; +} +#else /* !CONFIG_PP_SLEEP_DEBUG */ +static inline void pm_print_times_init(void) {} +#endif /* CONFIG_PM_SLEEP_DEBUG */ + struct kobject *power_kobj; /** @@ -562,6 +571,8 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, +#endif +#ifdef CONFIG_PM_SLEEP_DEBUG &pm_print_times_attr.attr, #endif #endif @@ -599,6 +610,7 @@ static int __init pm_init(void) error = sysfs_create_group(power_kobj, &attr_group); if (error) return error; + pm_print_times_init(); return pm_autosleep_init(); } -- cgit v1.2.3 From d8150d350408de6fb2b9ee7b7625ae8e2bb7aa4a Mon Sep 17 00:00:00 2001 From: Bojan Smojver Date: Thu, 21 Jun 2012 22:27:24 +0200 Subject: PM / Hibernate: Print hibernation/thaw progress indicator one line at a time. With the introduction of suspend to both into in-kernel hibernation code, dmesg was getting polluted with backspace characters printed as part of image saving progress indicator. This patch introduces printing of progress indicator on image save/load every 10% and one line at a time. As an additional benefit, all other messages emitted by the kernel during hibernation/thaw should now print cleanly as well. Signed-off-by: Bojan Smojver Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 54 ++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 83d505142b00..3c9d764eb0d8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle, struct timeval start; struct timeval stop; - printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; @@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); @@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) - printk(KERN_CONT "\b\b\b\bdone\n"); - else - printk(KERN_CONT "\n"); + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); return ret; } @@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle, printk(KERN_INFO "PM: Using %u thread(s) for compression.\n" - "PM: Compressing and saving image data (%u pages) ... ", + "PM: Compressing and saving image data (%u pages)...\n", nr_threads, nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; @@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle, data_of(*snapshot), PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", - nr_pages / m); + printk(KERN_INFO + "PM: Image saving progress: " + "%3d%%\n", + nr_pages / m * 10); nr_pages++; } if (!off) @@ -761,11 +762,8 @@ out_finish: do_gettimeofday(&stop); if (!ret) ret = err2; - if (!ret) { - printk(KERN_CONT "\b\b\b\bdone\n"); - } else { - printk(KERN_CONT "\n"); - } + if (!ret) + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); out_clean: if (crc) { @@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle, int err2; unsigned nr_pages; - printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; @@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); @@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) { - printk("\b\b\b\bdone\n"); + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; - } else - printk("\n"); + } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return ret; } @@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle, printk(KERN_INFO "PM: Using %u thread(s) for decompression.\n" - "PM: Loading and decompressing image data (%u pages) ... ", + "PM: Loading and decompressing image data (%u pages)...\n", nr_threads, nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; @@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].unc + off, PAGE_SIZE); if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO + "PM: Image loading progress: " + "%3d%%\n", + nr_pages / m * 10); nr_pages++; ret = snapshot_write_next(snapshot); @@ -1344,7 +1345,7 @@ out_finish: } do_gettimeofday(&stop); if (!ret) { - printk("\b\b\b\bdone\n"); + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; @@ -1357,8 +1358,7 @@ out_finish: } } } - } else - printk("\n"); + } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); out_clean: for (i = 0; i < ring_size; i++) -- cgit v1.2.3 From cba6d0d64ee53772b285d0c0c288deefbeaf7775 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jul 2012 07:08:42 -0700 Subject: Revert "rcu: Move PREEMPT_RCU preemption to switch_to() invocation" This reverts commit 616c310e83b872024271c915c1b9ab505b9efad9. (Move PREEMPT_RCU preemption to switch_to() invocation). Testing by Sasha Levin showed that this can result in deadlock due to invoking the scheduler when one of the runqueue locks is held. Because this commit was simply a performance optimization, revert it. Reported-by: Sasha Levin Signed-off-by: Paul E. McKenney Tested-by: Sasha Levin --- kernel/rcutree.c | 1 + kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 14 +++++++++++--- kernel/sched/core.c | 1 - 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 38ecdda3f55f..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea056495783e..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5271a020887e..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = __this_cpu_ptr(rcu_preempt_state.rda); + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -1001,6 +1001,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +/* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + /* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..eaead2df6aa8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ - rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); -- cgit v1.2.3 From f885b7f2b2de70be266d2cecc476f773a1e2ca5d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Apr 2012 15:52:53 -0700 Subject: rcu: Control RCU_FANOUT_LEAF from boot-time parameter Although making RCU_FANOUT_LEAF a kernel configuration parameter rather than a fixed constant makes it easier for people to decrease cache-miss overhead for large systems, it is of little help for people who must run a single pre-built kernel binary. This commit therefore allows the value of RCU_FANOUT_LEAF to be increased (but not decreased!) via a boot-time parameter named rcutree.rcu_fanout_leaf. Reported-by: Mike Galbraith Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 97 ++++++++++++++++++++++++++++++++++++++++++------- kernel/rcutree.h | 23 +++++++----- kernel/rcutree_plugin.h | 2 + kernel/rcutree_trace.c | 2 +- 4 files changed, 99 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..a4c592b66e10 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -60,17 +60,10 @@ /* Data structures. */ -static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; +static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; #define RCU_STATE_INITIALIZER(structname) { \ .level = { &structname##_state.node[0] }, \ - .levelcnt = { \ - NUM_RCU_LVL_0, /* root of hierarchy. */ \ - NUM_RCU_LVL_1, \ - NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, \ - NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ - }, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ @@ -91,6 +84,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +module_param(rcu_fanout_leaf, int, 0); +int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; +static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ + NUM_RCU_LVL_0, + NUM_RCU_LVL_1, + NUM_RCU_LVL_2, + NUM_RCU_LVL_3, + NUM_RCU_LVL_4, +}; +int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ + /* * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU @@ -2574,9 +2580,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i > 0; i--) + for (i = rcu_num_lvls - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; + rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -2586,7 +2592,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int i; cprv = NR_CPUS; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; @@ -2613,13 +2619,15 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ - for (i = 1; i < NUM_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) + rsp->levelcnt[i] = num_rcu_lvl[i]; + for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); /* Initialize the elements themselves, starting from the leaves. */ - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { @@ -2649,7 +2657,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - rnp = rsp->level[NUM_RCU_LVLS - 1]; + rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; @@ -2658,11 +2666,72 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } +/* + * Compute the rcu_node tree geometry from kernel parameters. This cannot + * replace the definitions in rcutree.h because those are needed to size + * the ->node array in the rcu_state structure. + */ +static void __init rcu_init_geometry(void) +{ + int i; + int j; + int n = NR_CPUS; + int rcu_capacity[MAX_RCU_LVLS + 1]; + + /* If the compile-time values are accurate, just leave. */ + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + return; + + /* + * Compute number of nodes that can be handled an rcu_node tree + * with the given number of levels. Setting rcu_capacity[0] makes + * some of the arithmetic easier. + */ + rcu_capacity[0] = 1; + rcu_capacity[1] = rcu_fanout_leaf; + for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + + /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Finally, the tree must be able to accommodate + * the configured number of CPUs. Complain and fall back to the + * compile-time values if these limits are exceeded. + */ + if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8 || + n > rcu_capacity[MAX_RCU_LVLS]) { + WARN_ON(1); + return; + } + + /* Calculate the number of rcu_nodes at each level of the tree. */ + for (i = 1; i <= MAX_RCU_LVLS; i++) + if (n <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) + num_rcu_lvl[j] = + DIV_ROUND_UP(n, rcu_capacity[i - j]); + rcu_num_lvls = i; + for (j = i + 1; j <= MAX_RCU_LVLS; j++) + num_rcu_lvl[j] = 0; + break; + } + + /* Calculate the total number of rcu_node structures. */ + rcu_num_nodes = 0; + for (i = 0; i <= MAX_RCU_LVLS; i++) + rcu_num_nodes += num_rcu_lvl[i]; + rcu_num_nodes -= n; +} + void __init rcu_init(void) { int cpu; rcu_bootup_announce(); + rcu_init_geometry(); rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 19b61ac1079f..780a0195d35a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -42,28 +42,28 @@ #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT_1 -# define NUM_RCU_LVLS 1 +# define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_2 -# define NUM_RCU_LVLS 2 +# define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_3 -# define NUM_RCU_LVLS 3 +# define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_4 -# define NUM_RCU_LVLS 4 +# define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) @@ -76,6 +76,9 @@ #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) +extern int rcu_num_lvls; +extern int rcu_num_nodes; + /* * Dynticks per-CPU state. */ @@ -206,7 +209,7 @@ struct rcu_node { */ #define rcu_for_each_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* * Do a breadth-first scan of the non-leaf rcu_node structures for the @@ -215,7 +218,7 @@ struct rcu_node { */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -224,8 +227,8 @@ struct rcu_node { * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ @@ -357,9 +360,9 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..fb92b6dd9980 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -70,6 +70,8 @@ static void __init rcu_bootup_announce_oddness(void) #if NUM_RCU_LVL_4 != 0 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); #endif + if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) + printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); } #ifdef CONFIG_TREE_PREEMPT_RCU diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16ddd1d4..a3556a2d842c 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -278,7 +278,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); level = rnp->level; -- cgit v1.2.3 From cc5df65b0370fc6aa2bfe3bb19e0451d5cafb99f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Jun 2012 18:16:00 -0700 Subject: rcu: Four-level hierarchy is no longer experimental Time to make the four-level-hierarchy setting less scary, so this commit removes "Experimental" from the boot-time message. Leave the message in order to get a heads-up on any possible need to expand to a five-level hierarchy. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index fb92b6dd9980..70e0fd256cc6 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,7 +68,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); + printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); -- cgit v1.2.3 From cca6f3931920a7547d02e68adc2ca635bea5600c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2012 21:00:28 -0700 Subject: rcu: Size rcu_node tree from nr_cpu_ids rather than NR_CPUS The rcu_node tree array is sized based on compile-time constants, including NR_CPUS. Although this approach has worked well in the past, the recent trend by many distros to define NR_CPUS=4096 results in excessive grace-period-initialization latencies. This commit therefore substitutes the run-time computed nr_cpu_ids for the compile-time NR_CPUS when building the tree. This can result in much of the compile-time-allocated rcu_node array being unused. If this is a major problem, you are in a specialized situation anyway, so you can manually adjust the NR_CPUS, RCU_FANOUT, and RCU_FANOUT_LEAF kernel config parameters. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree_plugin.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a4c592b66e10..0fdbc5e07302 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2675,7 +2675,7 @@ static void __init rcu_init_geometry(void) { int i; int j; - int n = NR_CPUS; + int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; /* If the compile-time values are accurate, just leave. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 70e0fd256cc6..ef2b5231afa4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -72,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + if (nr_cpu_ids != NR_CPUS) + printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); } #ifdef CONFIG_TREE_PREEMPT_RCU -- cgit v1.2.3 From 6c90cc7bf077f28144013e949ee0c122012d194a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 19:41:41 -0700 Subject: rcu: Prevent excessive line length in RCU_STATE_INITIALIZER() Upcoming rcu_barrier() concurrency commits will result in line lengths greater than 80 characters in the RCU_STATE_INITIALIZER(), so this commit shortens the name of the macro's argument to prevent this. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0fdbc5e07302..dd7fd96c90c5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -62,18 +62,18 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname##_state.node[0] }, \ +#define RCU_STATE_INITIALIZER(sname) { \ + .level = { &sname##_state.node[0] }, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ - .orphan_nxttail = &structname##_state.orphan_nxtlist, \ - .orphan_donetail = &structname##_state.orphan_donelist, \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .orphan_nxttail = &sname##_state.orphan_nxtlist, \ + .orphan_donetail = &sname##_state.orphan_donelist, \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ - .name = #structname, \ + .name = #sname, \ } struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); -- cgit v1.2.3 From 037b64ed0bf2405a1a01542164d3418564b44fff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 23:26:01 -0700 Subject: rcu: Place pointer to call_rcu() in rcu_data structure This is a preparatory commit for increasing rcu_barrier()'s concurrency. It adds a pointer in the rcu_data structure to the corresponding call_rcu() function. This allows a pointer to the rcu_data structure to imply the function pointer, which allows _rcu_barrier() state to be placed in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 27 ++++++++++++--------------- kernel/rcutree.h | 2 ++ kernel/rcutree_plugin.h | 5 +++-- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd7fd96c90c5..00c518fa34bb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -62,8 +62,9 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname) { \ +#define RCU_STATE_INITIALIZER(sname, cr) { \ .level = { &sname##_state.node[0] }, \ + .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ @@ -76,10 +77,11 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .name = #sname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); +struct rcu_state rcu_sched_state = + RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -2282,21 +2284,17 @@ static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); + struct rcu_state *rsp = type; atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); + rsp->call(head, rcu_barrier_callback); } /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(struct rcu_state *rsp, - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) +static void _rcu_barrier(struct rcu_state *rsp) { int cpu; unsigned long flags; @@ -2348,8 +2346,7 @@ static void _rcu_barrier(struct rcu_state *rsp, while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { - smp_call_function_single(cpu, rcu_barrier_func, - (void *)call_rcu_func, 1); + smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { preempt_enable(); @@ -2370,7 +2367,7 @@ static void _rcu_barrier(struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rsp->onofflock, flags); atomic_inc(&rcu_barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - call_rcu_func(&rh, rcu_barrier_callback); + rsp->call(&rh, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each @@ -2393,7 +2390,7 @@ static void _rcu_barrier(struct rcu_state *rsp, */ void rcu_barrier_bh(void) { - _rcu_barrier(&rcu_bh_state, call_rcu_bh); + _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -2402,7 +2399,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(&rcu_sched_state, call_rcu_sched); + _rcu_barrier(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 780a0195d35a..049896a835d9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -364,6 +364,8 @@ struct rcu_state { u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ + void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ + void (*func)(struct rcu_head *head)); /* The following fields are guarded by the root rcu_node's lock. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ef2b5231afa4..9cb3a68819fa 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -78,7 +78,8 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); +struct rcu_state rcu_preempt_state = + RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -944,7 +945,7 @@ static int rcu_preempt_cpu_has_callbacks(int cpu) */ void rcu_barrier(void) { - _rcu_barrier(&rcu_preempt_state, call_rcu); + _rcu_barrier(&rcu_preempt_state); } EXPORT_SYMBOL_GPL(rcu_barrier); -- cgit v1.2.3 From 06668efa9180f4824fe846a8ff96338c18646bc7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 23:57:46 -0700 Subject: rcu: Move _rcu_barrier()'s rcu_head structures to rcu_data structures In order for multiple flavors of RCU to each concurrently run one rcu_barrier(), each flavor needs its own per-CPU set of rcu_head structures. This commit therefore moves _rcu_barrier()'s set of per-CPU rcu_head structures from per-CPU variables to the existing per-CPU and per-RCU-flavor rcu_data structures. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 6 ++---- kernel/rcutree.h | 3 +++ 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 00c518fa34bb..1e552598b55d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -157,7 +157,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; @@ -2282,12 +2281,11 @@ static void rcu_barrier_callback(struct rcu_head *notused) */ static void rcu_barrier_func(void *type) { - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); struct rcu_state *rsp = type; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); atomic_inc(&rcu_barrier_cpu_count); - rsp->call(head, rcu_barrier_callback); + rsp->call(&rdp->barrier_head, rcu_barrier_callback); } /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 049896a835d9..586d93c978f2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -314,6 +314,9 @@ struct rcu_data { unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; + /* 6) _rcu_barrier() callback. */ + struct rcu_head barrier_head; + int cpu; struct rcu_state *rsp; }; -- cgit v1.2.3 From 24ebbca8ecdd5129d7f829a7cb5146aaeb531f77 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 00:34:56 -0700 Subject: rcu: Move rcu_barrier_cpu_count to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_cpu_count global variable to a new ->barrier_cpu_count field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 25 ++++++++++++++----------- kernel/rcutree.h | 1 + 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1e552598b55d..5929b021666d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -157,7 +157,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ -static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; @@ -2270,9 +2269,12 @@ static int rcu_cpu_has_callbacks(int cpu) * RCU callback function for _rcu_barrier(). If we are last, wake * up the task executing _rcu_barrier(). */ -static void rcu_barrier_callback(struct rcu_head *notused) +static void rcu_barrier_callback(struct rcu_head *rhp) { - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); + struct rcu_state *rsp = rdp->rsp; + + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rcu_barrier_completion); } @@ -2284,7 +2286,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -2297,9 +2299,9 @@ static void _rcu_barrier(struct rcu_state *rsp) int cpu; unsigned long flags; struct rcu_data *rdp; - struct rcu_head rh; + struct rcu_data rd; - init_rcu_head_on_stack(&rh); + init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rcu_barrier_mutex); @@ -2324,7 +2326,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * us -- but before CPU 1's orphaned callbacks are invoked!!! */ init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 1); + atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); @@ -2363,15 +2365,16 @@ static void _rcu_barrier(struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp); rsp->rcu_barrier_in_progress = NULL; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - rsp->call(&rh, rcu_barrier_callback); + rd.rsp = rsp; + rsp->call(&rd.barrier_head, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rcu_barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ @@ -2380,7 +2383,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_barrier_mutex); - destroy_rcu_head_on_stack(&rh); + destroy_rcu_head_on_stack(&rd.barrier_head); } /** diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 586d93c978f2..c57ef0b7f097 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -400,6 +400,7 @@ struct rcu_state { struct task_struct *rcu_barrier_in_progress; /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ + atomic_t barrier_cpu_count; /* # CPUs waiting on. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ -- cgit v1.2.3 From 7db74df88b52844f4e966901e2972bba725e6766 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 03:03:37 -0700 Subject: rcu: Move rcu_barrier_completion to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_completion global variable to a new ->barrier_completion field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 9 ++++----- kernel/rcutree.h | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5929b021666d..ca7d1678ac79 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -158,7 +158,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s @@ -2275,7 +2274,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rcu_barrier_completion); + complete(&rsp->barrier_completion); } /* @@ -2325,7 +2324,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * 6. Both rcu_barrier_callback() callbacks are invoked, awakening * us -- but before CPU 1's orphaned callbacks are invoked!!! */ - init_completion(&rcu_barrier_completion); + init_completion(&rsp->barrier_completion); atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; @@ -2375,10 +2374,10 @@ static void _rcu_barrier(struct rcu_state *rsp) * CPU, and thus each counted, remove the initial count. */ if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rcu_barrier_completion); + complete(&rsp->barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ - wait_for_completion(&rcu_barrier_completion); + wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_barrier_mutex); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c57ef0b7f097..d1ca4424122b 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -401,6 +401,7 @@ struct rcu_state { /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ + struct completion barrier_completion; /* Wake at barrier end. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ -- cgit v1.2.3 From 7be7f0be907224445acc62b3884c892f38b7ff40 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 05:18:53 -0700 Subject: rcu: Move rcu_barrier_mutex to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_mutex global variable to a new ->barrier_mutex field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 9 +++------ kernel/rcutree.h | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ca7d1678ac79..ff992ac24e73 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -71,6 +71,7 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ + .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -155,10 +156,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; -/* State information for rcu_barrier() and friends. */ - -static DEFINE_MUTEX(rcu_barrier_mutex); - /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -2303,7 +2300,7 @@ static void _rcu_barrier(struct rcu_state *rsp) init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rcu_barrier_mutex); + mutex_lock(&rsp->barrier_mutex); smp_mb(); /* Prevent any prior operations from leaking in. */ @@ -2380,7 +2377,7 @@ static void _rcu_barrier(struct rcu_state *rsp) wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rcu_barrier_mutex); + mutex_unlock(&rsp->barrier_mutex); destroy_rcu_head_on_stack(&rd.barrier_head); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d1ca4424122b..7641aec3e59c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -400,6 +400,7 @@ struct rcu_state { struct task_struct *rcu_barrier_in_progress; /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ raw_spinlock_t fqslock; /* Only one task forcing */ -- cgit v1.2.3 From cfed0a85dad921c683e9c0d25b072bcc5745ede0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Jun 2012 18:22:05 -0700 Subject: rcu: Remove needless initialization For global variables, C defaults all fields to zero. The initialization of the rcu_state structure's ->n_force_qs and ->n_force_qs_ngp fields is therefore redundant, so this commit removes these initializations. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ff992ac24e73..44a8fda9be86 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -73,8 +73,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ - .n_force_qs = 0, \ - .n_force_qs_ngp = 0, \ .name = #sname, \ } -- cgit v1.2.3 From cf3a9c4842b1e097dbe0854933c471d43dd24f69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 14:56:46 -0700 Subject: rcu: Increase rcu_barrier() concurrency The traditional rcu_barrier() implementation has serialized all requests, regardless of RCU flavor, and also does not coalesce concurrent requests. In the past, this has been good and sufficient. However, systems are getting larger and use of rcu_barrier() has been increasing. This commit therefore introduces a counter-based scheme that allows _rcu_barrier() calls for the same flavor of RCU to take advantage of each others' work. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 36 +++++++++++++++++++++++++++++++++++- kernel/rcutree.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 44a8fda9be86..6bb5d562253f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2294,13 +2294,41 @@ static void _rcu_barrier(struct rcu_state *rsp) unsigned long flags; struct rcu_data *rdp; struct rcu_data rd; + unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); + unsigned long snap_done; init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); - smp_mb(); /* Prevent any prior operations from leaking in. */ + /* + * Ensure that all prior references, including to ->n_barrier_done, + * are ordered before the _rcu_barrier() machinery. + */ + smp_mb(); /* See above block comment. */ + + /* + * Recheck ->n_barrier_done to see if others did our work for us. + * This means checking ->n_barrier_done for an even-to-odd-to-even + * transition. The "if" expression below therefore rounds the old + * value up to the next even number and adds two before comparing. + */ + snap_done = ACCESS_ONCE(rsp->n_barrier_done); + if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + smp_mb(); /* caller's subsequent code after above check. */ + mutex_unlock(&rsp->barrier_mutex); + return; + } + + /* + * Increment ->n_barrier_done to avoid duplicate work. Use + * ACCESS_ONCE() to prevent the compiler from speculating + * the increment to precede the early-exit check. + */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* * Initialize the count to one rather than to zero in order to @@ -2371,6 +2399,12 @@ static void _rcu_barrier(struct rcu_state *rsp) if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rsp->barrier_completion); + /* Increment ->n_barrier_done to prevent duplicate work. */ + smp_mb(); /* Keep increment after above mechanism. */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + smp_mb(); /* Keep increment before caller's subsequent code. */ + /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rsp->barrier_completion); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7641aec3e59c..be10286ad380 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -403,6 +403,8 @@ struct rcu_state { struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ + unsigned long n_barrier_done; /* ++ at start and end of */ + /* _rcu_barrier(). */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ -- cgit v1.2.3 From a83eff0a82a7f3f14fea477fd41e6c082e7fc96a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 May 2012 18:47:05 -0700 Subject: rcu: Add tracing for _rcu_barrier() This commit adds event tracing for _rcu_barrier() execution. This is defined only if RCU_TRACE=y. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6bb5d562253f..dda43d826504 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2259,6 +2259,17 @@ static int rcu_cpu_has_callbacks(int cpu) rcu_preempt_cpu_has_callbacks(cpu); } +/* + * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * the compiler is expected to optimize this away. + */ +static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, + int cpu, unsigned long done) +{ + trace_rcu_barrier(rsp->name, s, cpu, + atomic_read(&rsp->barrier_cpu_count), done); +} + /* * RCU callback function for _rcu_barrier(). If we are last, wake * up the task executing _rcu_barrier(). @@ -2268,8 +2279,12 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); struct rcu_state *rsp = rdp->rsp; - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); complete(&rsp->barrier_completion); + } else { + _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + } } /* @@ -2280,6 +2295,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -2298,6 +2314,7 @@ static void _rcu_barrier(struct rcu_state *rsp) unsigned long snap_done; init_rcu_head_on_stack(&rd.barrier_head); + _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); @@ -2315,7 +2332,9 @@ static void _rcu_barrier(struct rcu_state *rsp) * value up to the next even number and adds two before comparing. */ snap_done = ACCESS_ONCE(rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "Check", -1, snap_done); if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; @@ -2328,6 +2347,7 @@ static void _rcu_barrier(struct rcu_state *rsp) */ ACCESS_ONCE(rsp->n_barrier_done)++; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* @@ -2364,13 +2384,19 @@ static void _rcu_barrier(struct rcu_state *rsp) preempt_disable(); rdp = per_cpu_ptr(rsp->rda, cpu); if (cpu_is_offline(cpu)) { + _rcu_barrier_trace(rsp, "Offline", cpu, + rsp->n_barrier_done); preempt_enable(); while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { + _rcu_barrier_trace(rsp, "OnlineQ", cpu, + rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { + _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + rsp->n_barrier_done); preempt_enable(); } } @@ -2403,6 +2429,7 @@ static void _rcu_barrier(struct rcu_state *rsp) smp_mb(); /* Keep increment after above mechanism. */ ACCESS_ONCE(rsp->n_barrier_done)++; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); smp_mb(); /* Keep increment before caller's subsequent code. */ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ -- cgit v1.2.3 From d7e187c8e9f30543f9cadfed094896ff414acb8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 May 2012 11:49:15 -0700 Subject: rcu: Add rcu_barrier() statistics to debugfs tracing This commit adds an rcubarrier file to RCU's debugfs statistical tracing directory, providing diagnostic information on rcu_barrier(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_trace.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index a3556a2d842c..057408be6c3b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,40 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +static void print_rcubarrier(struct seq_file *m, struct rcu_state *rsp) +{ + seq_printf(m, "%c bcc: %d nbd: %lu\n", + rsp->rcu_barrier_in_progress ? 'B' : '.', + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); +} + +static int show_rcubarrier(struct seq_file *m, void *unused) +{ +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt: "); + print_rcubarrier(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched: "); + print_rcubarrier(m, &rcu_sched_state); + seq_puts(m, "rcu_bh: "); + print_rcubarrier(m, &rcu_bh_state); + return 0; +} + +static int rcubarrier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcubarrier, NULL); +} + +static const struct file_operations rcubarrier_fops = { + .owner = THIS_MODULE, + .open = rcubarrier_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + #ifdef CONFIG_RCU_BOOST static char convert_kthread_status(unsigned int kthread_status) @@ -453,6 +487,11 @@ static int __init rcutree_trace_init(void) if (!rcudir) goto free_out; + retval = debugfs_create_file("rcubarrier", 0444, rcudir, + NULL, &rcubarrier_fops); + if (!retval) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, NULL, &rcudata_fops); if (!retval) -- cgit v1.2.3 From 1bca8cf1a2c3c6683b12ad28a3e826ca7a834978 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 09:40:38 -0700 Subject: rcu: Remove unneeded __rcu_process_callbacks() argument With the advent of __this_cpu_ptr(), it is no longer necessary to pass both the rcu_state and rcu_data structures into __rcu_process_callbacks(). This commit therefore computes the rcu_data pointer from the rcu_state pointer within __rcu_process_callbacks() so that callers can pass in only the pointer to the rcu_state structure. This paves the way for linking the rcu_state structures together and iterating over them. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 ++++---- kernel/rcutree_plugin.h | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dda43d826504..5376a156be8a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1788,9 +1788,10 @@ unlock_fqs_ret: * whom the rdp belongs. */ static void -__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +__rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -1827,9 +1828,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_process_callbacks(struct softirq_action *unused) { trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state, - &__get_cpu_var(rcu_sched_data)); - __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + __rcu_process_callbacks(&rcu_sched_state); + __rcu_process_callbacks(&rcu_bh_state); rcu_preempt_process_callbacks(); trace_rcu_utilization("End RCU core"); } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 9cb3a68819fa..5a80cdd9a0a3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -687,8 +687,7 @@ static void rcu_preempt_check_callbacks(int cpu) */ static void rcu_preempt_process_callbacks(void) { - __rcu_process_callbacks(&rcu_preempt_state, - &__get_cpu_var(rcu_preempt_data)); + __rcu_process_callbacks(&rcu_preempt_state); } #ifdef CONFIG_RCU_BOOST -- cgit v1.2.3 From 6ce75a2326e6f8b3bdfb60e1de7934b89858e87b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 11:01:13 -0700 Subject: rcu: Introduce for_each_rcu_flavor() and use it The arrival of TREE_PREEMPT_RCU some years back included some ugly code involving either #ifdef or #ifdef'ed wrapper functions to iterate over all non-SRCU flavors of RCU. This commit therefore introduces a for_each_rcu_flavor() iterator over the rcu_state structures for each flavor of RCU to clean up a bit of the ugliness. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 53 +++++++++++++--------- kernel/rcutree.h | 12 +++-- kernel/rcutree_plugin.h | 116 ------------------------------------------------ 3 files changed, 37 insertions(+), 144 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5376a156be8a..b61c3ffc80e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -84,6 +84,7 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; @@ -860,9 +861,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) */ void rcu_cpu_stall_reset(void) { - rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_preempt_stall_reset(); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } static struct notifier_block rcu_panic_block = { @@ -1827,10 +1829,11 @@ __rcu_process_callbacks(struct rcu_state *rsp) */ static void rcu_process_callbacks(struct softirq_action *unused) { + struct rcu_state *rsp; + trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state); - __rcu_process_callbacks(&rcu_bh_state); - rcu_preempt_process_callbacks(); + for_each_rcu_flavor(rsp) + __rcu_process_callbacks(rsp); trace_rcu_utilization("End RCU core"); } @@ -2241,9 +2244,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_pending(int cpu) { - return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_pending(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + return 1; + return 0; } /* @@ -2253,10 +2259,13 @@ static int rcu_pending(int cpu) */ static int rcu_cpu_has_callbacks(int cpu) { + struct rcu_state *rsp; + /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_sched_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_cpu_has_callbacks(cpu); + for_each_rcu_flavor(rsp) + if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) + return 1; + return 0; } /* @@ -2551,9 +2560,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) static void __cpuinit rcu_prepare_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_sched_state, 0); - rcu_init_percpu_data(cpu, &rcu_bh_state, 0); - rcu_preempt_init_percpu_data(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_init_percpu_data(cpu, rsp, + strcmp(rsp->name, "rcu_preempt") == 0); } /* @@ -2565,6 +2576,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + struct rcu_state *rsp; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2589,18 +2601,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. */ - rcu_cleanup_dying_cpu(&rcu_bh_state); - rcu_cleanup_dying_cpu(&rcu_sched_state); - rcu_preempt_cleanup_dying_cpu(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); - rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); - rcu_preempt_cleanup_dead_cpu(cpu); + for_each_rcu_flavor(rsp) + rcu_cleanup_dead_cpu(cpu, rsp); break; default: break; @@ -2717,6 +2727,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } + list_add(&rsp->flavors, &rcu_struct_flavors); } /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index be10286ad380..b92c4550a6e6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -422,8 +422,13 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. */ + struct list_head flavors; /* List of RCU flavors. */ }; +extern struct list_head rcu_struct_flavors; +#define for_each_rcu_flavor(rsp) \ + list_for_each_entry((rsp), &rcu_struct_flavors, flavors) + /* Return values for rcu_preempt_offline_tasks(). */ #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ @@ -466,25 +471,18 @@ static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); -static void rcu_preempt_stall_reset(void); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_cleanup_dead_cpu(int cpu); static void rcu_preempt_check_callbacks(int cpu); -static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ -static int rcu_preempt_pending(int cpu); -static int rcu_preempt_cpu_has_callbacks(int cpu); -static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5a80cdd9a0a3..d18b4d383afe 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -544,16 +544,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -/* - * Suppress preemptible RCU's CPU stall warnings by pushing the - * time of the next stall-warning message comfortably far into the - * future. - */ -static void rcu_preempt_stall_reset(void) -{ - rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; -} - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace @@ -654,14 +644,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Do CPU-offline processing for preemptible RCU. - */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ - rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); -} - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, @@ -682,14 +664,6 @@ static void rcu_preempt_check_callbacks(int cpu) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_state); -} - #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void) @@ -921,24 +895,6 @@ mb_ret: } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -/* - * Check to see if there is any immediate preemptible-RCU-related work - * to be done. - */ -static int rcu_preempt_pending(int cpu) -{ - return __rcu_pending(&rcu_preempt_state, - &per_cpu(rcu_preempt_data, cpu)); -} - -/* - * Does preemptible RCU have callbacks on this CPU? - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return !!per_cpu(rcu_preempt_data, cpu).nxtlist; -} - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. */ @@ -948,23 +904,6 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Initialize preemptible RCU's per-CPU data. - */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ - rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); -} - -/* - * Move preemptible RCU's callbacks from dying CPU to other online CPU - * and record a quiescent state. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ - rcu_cleanup_dying_cpu(&rcu_preempt_state); -} - /* * Initialize preemptible RCU's state structures. */ @@ -1049,14 +988,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return 0; } -/* - * Because preemptible RCU does not exist, there is no need to suppress - * its CPU stall warnings. - */ -static void rcu_preempt_stall_reset(void) -{ -} - /* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for @@ -1084,14 +1015,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Because preemptible RCU does not exist, it never needs CPU-offline - * processing. - */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ -} - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1100,14 +1023,6 @@ static void rcu_preempt_check_callbacks(int cpu) { } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - /* * Queue an RCU callback for lazy invocation after a grace period. * This will likely be later named something like "call_rcu_lazy()", @@ -1148,22 +1063,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Because preemptible RCU does not exist, it never has any work to do. - */ -static int rcu_preempt_pending(int cpu) -{ - return 0; -} - -/* - * Because preemptible RCU does not exist, it never has callbacks - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return 0; -} - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). @@ -1174,21 +1073,6 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Because preemptible RCU does not exist, there is no per-CPU - * data to initialize. - */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ -} - -/* - * Because there is no preemptible RCU, there is no cleanup to do. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ -} - /* * Because preemptible RCU does not exist, it need not be initialized. */ -- cgit v1.2.3 From c0cc962da3e7770feb3665f087ea3e23d8c15479 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 12:25:39 -0700 Subject: rcu: Use for_each_rcu_flavor() in TREE_RCU tracing This commit applies the new for_each_rcu_flavor() macro to the kernel/rcutree_trace.c file. Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 116 ++++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 057408be6c3b..a16ddbd6fdc4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,24 +46,15 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -static void print_rcubarrier(struct seq_file *m, struct rcu_state *rsp) -{ - seq_printf(m, "%c bcc: %d nbd: %lu\n", - rsp->rcu_barrier_in_progress ? 'B' : '.', - atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); -} - static int show_rcubarrier(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt: "); - print_rcubarrier(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched: "); - print_rcubarrier(m, &rcu_sched_state); - seq_puts(m, "rcu_bh: "); - print_rcubarrier(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", + rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); return 0; } @@ -129,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -#define PRINT_RCU_DATA(name, func, m) \ - do { \ - int _p_r_d_i; \ - \ - for_each_possible_cpu(_p_r_d_i) \ - func(m, &per_cpu(name, _p_r_d_i)); \ - } while (0) - static int show_rcudata(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); - seq_puts(m, "rcu_bh:\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); + int cpu; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -200,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { + int cpu; + struct rcu_state *rsp; + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); @@ -207,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "\"rcu_preempt:\"\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "\"rcu_sched:\"\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); - seq_puts(m, "\"rcu_bh:\"\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); + for_each_rcu_flavor(rsp) { + seq_printf(m, "\"%s:\"\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -304,9 +287,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " + seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->completed, gpnum, rsp->fqs_state, + rsp->name, rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, @@ -329,14 +312,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) static int show_rcuhier(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_one_rcu_state(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_one_rcu_state(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_one_rcu_state(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + print_one_rcu_state(m, rsp); return 0; } @@ -377,11 +356,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) static int show_rcugp(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - show_one_rcugp(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - show_one_rcugp(m, &rcu_sched_state); - show_one_rcugp(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + show_one_rcugp(m, rsp); return 0; } @@ -416,28 +394,20 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_need_nothing); } -static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +static int show_rcu_pending(struct seq_file *m, void *unused) { int cpu; struct rcu_data *rdp; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } } -} - -static int show_rcu_pending(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_rcu_pendings(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_rcu_pendings(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_rcu_pendings(m, &rcu_bh_state); return 0; } -- cgit v1.2.3 From ff015030c939f0bec68fa9b8898da3aaa7fe55ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 13:16:58 -0700 Subject: rcu: RCU_SAVE_DYNTICK code no longer ever dead Before RCU had unified idle, the RCU_SAVE_DYNTICK leg of the switch statement in force_quiescent_state() was dead code for CONFIG_NO_HZ=n kernel builds. With unified idle, the code is never dead. This commit therefore removes the "if" statement designed to make gcc aware of when the code was and was not dead. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b61c3ffc80e9..967b4bed2cf3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1747,8 +1747,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) - break; /* So gcc recognizes the dead code. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ -- cgit v1.2.3 From 751a68b2e46fe11a42450cb55a16e8065eddec7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2012 13:44:44 -0700 Subject: rcu: Rationalize ordering of torture_ops list Move the raw SRCU interfaces out of the middle of the normal SRCU interfaces. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e66b34ab7555..9850479f319d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1908,8 +1908,8 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, - &srcu_raw_sync_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, + &srcu_raw_ops, &srcu_raw_sync_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); -- cgit v1.2.3 From e3f8d3788ed7cf55946030dc9b76e73edb111602 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2012 10:21:50 -0700 Subject: rcu: Test srcu_barrier() from rcutorture test suite SRCU now has a call_srcu() and an srcu_barrier(), but rcutorture does not test them. This commit adds the machinery to allow rcutorture's existing tests for call_rcu() and rcu_barrier() to apply to the SRCU equivalents. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9850479f319d..7b6935e0cee3 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -635,6 +635,17 @@ static void srcu_torture_synchronize(void) synchronize_srcu(&srcu_ctl); } +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + static int srcu_torture_stats(char *page) { int cnt = 0; @@ -661,8 +672,8 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .name = "srcu" }; -- cgit v1.2.3 From c6ebcbb60c8c68a88160fe54302e851700d1362c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 19:21:41 -0700 Subject: rcu: Fix bug in rcu_barrier() torture test The child threads in the rcu_torture_barrier_cbs() are improperly synchronized, which can cause the rcu_barrier() tests to hang. The failure mode is as follows: 1. CPU 0 running in rcu_torture_barrier() sets barrier_cbs_count to n_barrier_cbs. 2. CPU 1 running in rcu_torture_barrier_cbs() wakes up, posts its RCU callback, and atomically decrements barrier_cbs_count. Because barrier_cbs_count is not zero, it does not do the wake_up(). 3. CPU 2 running in rcu_torture_barrier_cbs() wakes up, but finds that barrier_cbs_count is not equal to n_barrier_cbs, and so returns to sleep. 4. The value of barrier_cbs_count therefore never reaches zero, which causes the test to hang. This commit therefore uses a phase variable to coordinate the test, preventing this scenario from occurring. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7b6935e0cee3..f7fe73e59c9f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -206,6 +206,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); @@ -1642,6 +1643,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; + bool lastphase = 0; struct rcu_head rcu; init_rcu_head_on_stack(&rcu); @@ -1649,9 +1651,11 @@ static int rcu_torture_barrier_cbs(void *arg) set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], - atomic_read(&barrier_cbs_count) == n_barrier_cbs || + barrier_phase != lastphase || kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP); + lastphase = barrier_phase; + smp_mb(); /* ensure barrier_phase load before ->call(). */ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); @@ -1676,7 +1680,8 @@ static int rcu_torture_barrier(void *arg) do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); - /* wake_up() path contains the required barriers. */ + smp_mb(); /* Ensure barrier_phase after prior assignments. */ + barrier_phase = !barrier_phase; for (i = 0; i < n_barrier_cbs; i++) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, -- cgit v1.2.3 From 143aa672f4fc643420c8325ad09c379ed33a27cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 May 2012 18:17:48 -0700 Subject: rcu: Fix diagnostic-printk typo in rcutorture The rcu_torture_barrier() function has a copy-and-paste typo in the string passed to rcutorture_shutdown_absorb(), which this commit fixes. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index f7fe73e59c9f..045a3dc233ea 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1700,7 +1700,7 @@ static int rcu_torture_barrier(void *arg) schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + rcutorture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); return 0; -- cgit v1.2.3 From 72472a02a9c4507ef54d03d71bb253c26015f52c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 17:50:51 -0700 Subject: rcu: Make rcutorture fakewriters invoke rcu_barrier() The current rcutorture rcu_barrier() testing never intentionally runs more than one instance of rcu_barrier() at a given time. This fails to test the the shiny new concurrency features of rcu_barrier(). This commit therefore modifies the rcutorture fakewriter kthread to randomly invoke rcu_barrier() rather than the usual synchronize_rcu(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 045a3dc233ea..c279ee920947 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1025,7 +1025,11 @@ rcu_torture_fakewriter(void *arg) do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); - cur_ops->sync(); + if (cur_ops->cb_barrier != NULL && + rcu_random(&rand) % (nfakewriters * 8) == 0) + cur_ops->cb_barrier(); + else + cur_ops->sync(); rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); -- cgit v1.2.3 From 285fe29481d865ae381ad3924c80894e6968c2d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 08:45:12 -0700 Subject: rcu: Fix detection of abruptly-ending stall The code that attempts to identify stalls that end just as we detect them is broken by both flavors of initialization failure. This commit therefore properly initializes and computes the count of the number of reasons why the RCU grace period is stalled. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..dc8c5284fe06 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -733,7 +733,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) int cpu; long delta; unsigned long flags; - int ndetected; + int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); /* Only let one CPU complain about others per time interval. */ @@ -774,7 +774,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) */ rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected = rcu_print_task_stall(rnp); + ndetected += rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); -- cgit v1.2.3 From 3f5d3ea64f1783f0d4ea0d35890ae3297f045a8b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 15:39:56 -0700 Subject: rcu: Consolidate duplicate callback-list initialization There are a couple of open-coded initializations of the rcu_data structure's RCU callback list. This commit therefore consolidates them into a new init_callback_list() function. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dc8c5284fe06..81e0394e46af 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) return ret; } +/* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + int i; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; +} + /* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp @@ -1328,8 +1340,6 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; - /* * Orphan the callbacks. First adjust the counts. This is safe * because ->onofflock excludes _rcu_barrier()'s adoption of @@ -1369,9 +1379,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, } /* Finally, initialize the rcu_data structure's list to empty. */ - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); } /* @@ -2407,16 +2415,13 @@ static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - int i; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -- cgit v1.2.3 From 1d1fb395f6dbc07b36285bbedcf01a73b57f7cb5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 15:44:42 -0700 Subject: rcu: Add ACCESS_ONCE() to ->qlen accesses The _rcu_barrier() function accesses other CPUs' rcu_data structure's ->qlen field without benefit of locking. This commit therefore adds the required ACCESS_ONCE() wrappers around accesses and updates that need it. ACCESS_ONCE() is not needed when a CPU accesses its own ->qlen, or in code that cannot run while _rcu_barrier() is sampling ->qlen fields. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 81e0394e46af..89addada3e3a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1350,7 +1350,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->qlen += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; } /* @@ -1600,7 +1600,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - rdp->qlen -= count; + ACCESS_ONCE(rdp->qlen) -= count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -1889,7 +1889,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - rdp->qlen++; + ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; else @@ -2423,7 +2423,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); init_callback_list(rdp); rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -- cgit v1.2.3 From 2a3fa843b555d202e682bf08c65ee1a4a93c79cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 May 2012 11:58:36 -0700 Subject: rcu: Consolidate tree/tiny __rcu_read_{,un}lock() implementations The CONFIG_TREE_PREEMPT_RCU and CONFIG_TINY_PREEMPT_RCU versions of __rcu_read_lock() and __rcu_read_unlock() are identical, so this commit consolidates them into kernel/rcupdate.h. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutiny_plugin.h | 47 +---------------------------------------------- kernel/rcutree_plugin.h | 47 +---------------------------------------------- 3 files changed, 46 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 95cba41ce1e9..4e6a61b15e86 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,6 +53,50 @@ #ifdef CONFIG_PREEMPT_RCU +/* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. */ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + /* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..a269b0da0eb6 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { RCU_TRACE(.rcb.name = "rcu_preempt") }; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(void); static void rcu_report_exp_done(void); @@ -526,24 +525,12 @@ void rcu_preempt_note_context_switch(void) local_irq_restore(flags); } -/* - * Tiny-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -626,38 +613,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) local_irq_restore(flags); } -/* - * Tiny-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the rcu_preempt_ctrlblk structure, which is diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..4b6b17cdf66b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -78,7 +78,6 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -232,18 +231,6 @@ static void rcu_preempt_note_context_switch(int cpu) local_irq_restore(flags); } -/* - * Tree-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - /* * Check for preempted RCU readers blocking the current grace period * for the specified rcu_node structure. If the caller needs a reliable @@ -310,7 +297,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -418,38 +405,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) } } -/* - * Tree-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* -- cgit v1.2.3 From 62fde6edf12b60fddb13a3f0a779c8be0bb7447e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2012 22:10:24 -0700 Subject: rcu: Make __call_rcu() handle invocation from idle Although __call_rcu() is handled correctly when called from a momentary non-idle period, if it is called on a CPU that RCU believes to be idle on RCU_FAST_NO_HZ kernels, the callback might be indefinitely postponed. This commit therefore ensures that RCU is aware of the new callback and has a chance to force the CPU out of dyntick-idle mode when a new callback is posted. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 89addada3e3a..a4a9c916ad36 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -585,8 +585,6 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } -#ifdef CONFIG_PROVE_RCU - /** * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * @@ -604,7 +602,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* * Is the current CPU online? Disable preemption to avoid false positives @@ -645,9 +643,7 @@ bool rcu_lockdep_current_cpu_online(void) } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ /** * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle @@ -1904,6 +1900,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), else trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + /* + * If called from an extended quiescent state, invoke the RCU + * core in order to force a re-evaluation of RCU's idleness. + */ + if (rcu_is_cpu_idle()) + invoke_rcu_core(); + /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { local_irq_restore(flags); -- cgit v1.2.3 From a16b7a693430406dc229ab0c6b154f669a2031c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 26 May 2012 08:56:01 -0700 Subject: rcu: Prevent __call_rcu() from invoking RCU core on offline CPUs The __call_rcu() function will invoke the RCU core, for example, if it detects that the current CPU has too many callbacks. However, this can happen on an offline CPU that is on its way to the idle loop, in which case it is an error to invoke the RCU core, and the excess callbacks will be adopted in any case. This commit therefore adds checks to __call_rcu() for running on an offline CPU, refraining from invoking the RCU core in this case. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a4a9c916ad36..ceaa95923a87 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1904,11 +1904,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (rcu_is_cpu_idle()) + if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) invoke_rcu_core(); - /* If interrupts were disabled, don't dive into RCU core. */ - if (irqs_disabled_flags(flags)) { + /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) { local_irq_restore(flags); return; } -- cgit v1.2.3 From 29154c57e35a191c83b19c61b1935c9f21957662 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 May 2012 03:21:48 -0700 Subject: rcu: Split RCU core processing out of __call_rcu() The __call_rcu() function is a bit overweight, so this commit splits it into actual enqueuing of and accounting for the callback (__call_rcu()) and associated RCU-core processing (__call_rcu_core()). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 90 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ceaa95923a87..70c4da7d2a97 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1861,45 +1861,12 @@ static void invoke_rcu_core(void) raise_softirq(RCU_SOFTIRQ); } -static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp, bool lazy) +/* + * Handle any core-RCU processing required by a call_rcu() invocation. + */ +static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, + struct rcu_head *head, unsigned long flags) { - unsigned long flags; - struct rcu_data *rdp; - - WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - smp_mb(); /* Ensure RCU update seen before callback registry. */ - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ - local_irq_save(flags); - rdp = this_cpu_ptr(rsp->rda); - - /* Add the callback to our list. */ - ACCESS_ONCE(rdp->qlen)++; - if (lazy) - rdp->qlen_lazy++; - else - rcu_idle_count_callbacks_posted(); - smp_mb(); /* Count before adding callback for rcu_barrier(). */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; - - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen_lazy, rdp->qlen); - else - trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); - /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -1908,10 +1875,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ - if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) { - local_irq_restore(flags); + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) return; - } /* * Force the grace period if too many callbacks or too long waiting. @@ -1944,6 +1909,49 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), } } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); +} + +static void +__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), + struct rcu_state *rsp, bool lazy) +{ + unsigned long flags; + struct rcu_data *rdp; + + WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + smp_mb(); /* Ensure RCU update seen before callback registry. */ + + /* + * Opportunistically note grace-period endings and beginnings. + * Note that we might see a beginning right after we see an + * end, but never vice versa, since this CPU has to pass through + * a quiescent state betweentimes. + */ + local_irq_save(flags); + rdp = this_cpu_ptr(rsp->rda); + + /* Add the callback to our list. */ + ACCESS_ONCE(rdp->qlen)++; + if (lazy) + rdp->qlen_lazy++; + else + rcu_idle_count_callbacks_posted(); + smp_mb(); /* Count before adding callback for rcu_barrier(). */ + *rdp->nxttail[RCU_NEXT_TAIL] = head; + rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + + if (__is_kfree_rcu_offset((unsigned long)func)) + trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, + rdp->qlen_lazy, rdp->qlen); + else + trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + + /* Go handle any RCU core processing required. */ + __call_rcu_core(rsp, rdp, head, flags); local_irq_restore(flags); } -- cgit v1.2.3 From d7118175cce7e76b3292b60832813ef1f28b6523 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Jun 2012 07:12:43 -0700 Subject: rcu: Fix rcu_is_cpu_idle() #ifdef in TINY_RCU The rcu_is_cpu_idle() function is used if CONFIG_DEBUG_LOCK_ALLOC, but TINY_RCU defines it only when CONFIG_PROVE_RCU. This causes build failures when CONFIG_DEBUG_LOCK_ALLOC=y but CONFIG_PROVE_RCU=n. This commit therefore adjusts the #ifdefs for rcu_is_cpu_idle() so that it is defined when CONFIG_DEBUG_LOCK_ALLOC=y. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 37a5444204d2..547b1fe5b052 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -172,7 +172,7 @@ void rcu_irq_enter(void) local_irq_restore(flags); } -#ifdef CONFIG_PROVE_RCU +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Test whether RCU thinks that the current CPU is idle. @@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* * Test whether the current CPU was interrupted from idle. Nested -- cgit v1.2.3 From 1c17e4d4437d8045a596d9f06c1558dc09e2b372 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jun 2012 10:43:16 +0200 Subject: rcu: Prevent uninitialized string in RCU CPU stall info An uninitialized string may be displayed at the end of the rcu_preempt detected stall info such as 0: (1 GPs behind) idle=075/140000000000000/0 =8?^D=8?^D ^^^^^^^^^^ if CONFIG_RCU_FAST_NO_HZ is not defined. This trivial patch clears the string in this case. Signed-off-by: Carsten Emde Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4b6b17cdf66b..395cdd1e0634 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2224,6 +2224,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { + *cp = '\0'; } #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.3 From 95f0c1de3e6ed4383cc4b5f52ce4ecfb21026b49 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Jun 2012 11:58:27 -0700 Subject: rcu: Disable preemption in rcu_blocking_is_gp() It is time to optimize CONFIG_TREE_PREEMPT_RCU's synchronize_rcu() for uniprocessor optimization, which means that rcu_blocking_is_gp() can no longer rely on RCU read-side critical sections having disabled preemption. This commit therefore disables preemption across rcu_blocking_is_gp()'s scan of the cpu_online_mask. (Updated from previous version to fix embarrassing bug spotted by Wu Fengguang.) Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 70c4da7d2a97..e000a623e635 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1981,28 +1981,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * occasionally incorrectly indicate that there are multiple CPUs online * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. - * - * Of course, sampling num_online_cpus() with preemption enabled can - * give erroneous results if there are concurrent CPU-hotplug operations. - * For example, given a demonic sequence of preemptions in num_online_cpus() - * and CPU-hotplug operations, there could be two or more CPUs online at - * all times, but num_online_cpus() might well return one (or even zero). - * - * However, all such demonic sequences require at least one CPU-offline - * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer - * is only a problem if there is an RCU read-side critical section executing - * throughout. But RCU-sched and RCU-bh read-side critical sections - * disable either preemption or bh, which prevents a CPU from going offline. - * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return - * that there is only one CPU when in fact there was more than one throughout - * is when there were no RCU readers in the system. If there are no - * RCU readers, the grace period by definition can be of zero length, - * regardless of the number of online CPUs. */ static inline int rcu_blocking_is_gp(void) { + int ret; + might_sleep(); /* Check for RCU read-side critical section. */ - return num_online_cpus() <= 1; + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; } /** -- cgit v1.2.3 From cf01537ecf192d9ff15c32a355db5d5af22bea4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 11:26:42 -0700 Subject: rcu: Add check for CPUs going offline with callbacks queued If a CPU goes offline with callbacks queued, those callbacks might be indefinitely postponed, which can result in a system hang. This commit therefore inserts warnings for this condition. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e000a623e635..95c7b61e77e4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1509,6 +1509,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp, true); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", + cpu, rdp->qlen, rdp->nxtlist); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From bf1304e9cd755be7814ac8365834b7a1d0f06b58 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 15:55:39 -0700 Subject: rcu: Dump only the current CPU's buffers for idle-entry/exit warnings Problems in RCU idle entry and exit are almost always confined to the offending CPU. This commit therefore switches ftrace_dump() from DUMP_ALL to DUMP_ORIG. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..6eb48f13eeeb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -358,7 +358,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -468,7 +468,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) trace_rcu_dyntick("Error on exit: not idle task", oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ -- cgit v1.2.3 From 28f8555364ef5b54b21251c5c8022109a70626e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 15:37:48 -0700 Subject: rcu: The rcu_needs_cpu() function is not a quiescent state The TINY_PREEMPT_RCU() function rcu_preempt_needs_cpu(), which is called from rcu_needs_cpu(), assumes that it is in a quiescent state with respect to the CPU. This is no longer the case. This commit therefore updates rcu_preempt_needs_cpu() to make it aware that it is not running in a quiescent state. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutiny_plugin.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..849ede51c12d 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -846,8 +846,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); */ int rcu_preempt_needs_cpu(void) { - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; } -- cgit v1.2.3 From e84c48ae3024ac2f14ed1c3671e5ea37c60fb838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Jun 2012 20:45:10 -0700 Subject: rcu: Round FAST_NO_HZ lazy timeout to nearest second Currently, if several CPUs in the same package have all lazy RCU callbacks, their wakeups will be uncorrelated. If all the CPUs are in the same power domain (as is often the case), this will result in unnecessary power-ups of the package. This commit therefore uses round_jiffies() to round the timeouts to a second boundary, increasing the odds that they can be coalesced with each other or with other timeouts. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..c28d25522a81 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1968,7 +1968,7 @@ static void rcu_idle_count_callbacks_posted(void) */ #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ -#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ +#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ /* @@ -2047,10 +2047,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) return 1; } /* Set up for the possibility that RCU will post a timer. */ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) - *delta_jiffies = RCU_IDLE_GP_DELAY; - else - *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, + RCU_IDLE_GP_DELAY) - jiffies; + } else { + *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; + *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + } return 0; } @@ -2187,10 +2190,11 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); rdtp->idle_gp_timer_expires = - jiffies + RCU_IDLE_GP_DELAY; + round_up(jiffies + RCU_IDLE_GP_DELAY, + RCU_IDLE_GP_DELAY); } else { rdtp->idle_gp_timer_expires = - jiffies + RCU_IDLE_LAZY_GP_DELAY; + round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); trace_rcu_prep_idle("Dyntick with lazy callbacks"); } tp = &rdtp->idle_gp_timer; -- cgit v1.2.3 From 9d2ad24306f2fafc3612e5a216aab31f9e56e879 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 24 Jun 2012 10:15:02 -0700 Subject: rcu: Make RCU_FAST_NO_HZ respect nohz= boot parameter If the nohz= boot parameter disables nohz, then RCU_FAST_NO_HZ needs to also disable itself. This commit therefore checks for tick_nohz_enabled being zero, disabling rcu_prepare_for_idle() if so. This commit assumes that tick_nohz_enabled can change at runtime: If this is not the case, then a simpler approach suffices. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 15 +++++++++++++++ kernel/time/tick-sched.c | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 19b61ac1079f..e978845c6800 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -97,6 +97,7 @@ struct rcu_dynticks { /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c28d25522a81..3508000e077d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1971,6 +1971,8 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ +extern int tick_nohz_enabled; + /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2112,6 +2114,7 @@ static void rcu_cleanup_after_idle(int cpu) del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); + rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); } /* @@ -2137,6 +2140,18 @@ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + int tne; + + /* Handle nohz enablement switches conservatively. */ + tne = ACCESS_ONCE(tick_nohz_enabled); + if (tne != rdtp->tick_nohz_enabled_snap) { + if (rcu_cpu_has_callbacks(cpu)) + invoke_rcu_core(); /* force nohz to see update. */ + rdtp->tick_nohz_enabled_snap = tne; + return; + } + if (!tne) + return; /* * If this is an idle re-entry, for example, due to use of diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..66ff07f6184c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void) /* * NO HZ enabled ? */ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; /* * Enable / Disable tickless mode -- cgit v1.2.3 From 164c33c6adee609b8b9062cce4c10f764d0dce13 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Mon, 25 Jun 2012 18:18:15 -0700 Subject: sched: Fix fork() error path to not crash In dup_task_struct(), if arch_dup_task_struct() fails, the clean up code fails to clean up correctly. That's because the clean up code depends on unininitalized ti->task pointer. We fix this by making sure that the task and thread_info know about each other before we attempt to take the error path. Signed-off-by: Salman Qazi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120626011815.11323.5533.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..f00e319d8376 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } err = arch_dup_task_struct(tsk, orig); - if (err) - goto out; + /* + * We defer looking at err, because we will need this setup + * for the clean up path to work correctly. + */ tsk->stack = ti; - setup_thread_stack(tsk, orig); + + if (err) + goto out; + clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); -- cgit v1.2.3 From 5167e8d5417bf5c322a703d2927daec727ea40dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 15:52:09 +0200 Subject: sched/nohz: Rewrite and fix load-avg computation -- again Thanks to Charles Wang for spotting the defects in the current code: - If we go idle during the sample window -- after sampling, we get a negative bias because we can negate our own sample. - If we wake up during the sample window we get a positive bias because we push the sample to a known active period. So rewrite the entire nohz load-avg muck once again, now adding copious documentation to the code. Reported-and-tested-by: Doug Smythies Reported-and-tested-by: Charles Wang Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins [ minor edits ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 275 ++++++++++++++++++++++++++++++++++------------- kernel/sched/idle_task.c | 1 - kernel/sched/sched.h | 2 - kernel/time/tick-sched.c | 2 + 4 files changed, 205 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..bb840405335d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2161,11 +2161,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -2182,6 +2244,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -2193,30 +2258,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; -void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_write_idx(void) { + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void) { - long delta = 0; + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; /* - * Its got a race, we don't care... + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -2302,66 +2455,39 @@ static void calc_global_nohz(void) { long delta, active, n; - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * It could be the one fold was all it took, we done! - */ - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} + calc_load_update += n * LOAD_FREQ; + } -static inline long calc_load_fold_idle(void) -{ - return 0; + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2369,11 +2495,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -2384,12 +2517,7 @@ void calc_global_load(unsigned long ticks) calc_load_update += LOAD_FREQ; /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. */ calc_global_nohz(); } @@ -2406,13 +2534,16 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); this_rq->calc_load_update += LOAD_FREQ; } +/* + * End of global load-average stuff + */ + /* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void) account_idle_ticks(ticks); #endif + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.3 From 74a7f08448adea6cb47cd9b260c98ff168117e92 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 15 Jun 2012 11:50:25 -0600 Subject: devicetree: add helper inline for retrieving a node's full name The pattern (np ? np->full_name : "") is rather common in the kernel, but can also make for quite long lines. This patch adds a new inline function, of_node_full_name() so that the test for a valid node pointer doesn't need to be open coded at all call sites. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Signed-off-by: Rob Herring --- kernel/irq/irqdomain.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 41c1564103f1..38c5eb839c92 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -448,7 +448,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); + hwirq, of_node_full_name(domain->of_node), virq); return virq; } @@ -477,7 +477,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return intspec[0]; #endif pr_warning("no irq domain found for %s !\n", - controller->full_name); + of_node_full_name(controller)); return 0; } @@ -725,8 +725,8 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain && desc->irq_data.domain->of_node) - p = desc->irq_data.domain->of_node->full_name; + if (desc->irq_data.domain) + p = of_node_full_name(desc->irq_data.domain->of_node); else p = none; seq_printf(m, "%s\n", p); -- cgit v1.2.3 From cfca927972e31a5b3da49bf641c525732ff3c357 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Jun 2012 12:54:17 -0700 Subject: rcu: Introduce check for callback list/count mismatch The recent bug that introduced the RCU callback list/count mismatch showed the need for a diagnostic to check for this, which this commit adds. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 95c7b61e77e4..4154c9567a6d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1612,6 +1612,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = rdp->qlen; + WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); local_irq_restore(flags); -- cgit v1.2.3 From c701d5d9b384ff03ceb232ef21236364d784a411 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jun 2012 08:08:25 -0700 Subject: rcu: Fix code-style issues involving "else" The Linux kernel coding style says that single-statement blocks should omit curly braces unless the other leg of the "if" statement has multiple statements, in which case the curly braces should be included. This commit fixes RCU's violations of this rule. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 7 ++++--- kernel/rcutorture.c | 3 ++- kernel/rcutree.c | 7 ++++--- kernel/rcutree_plugin.h | 14 ++++++++------ 4 files changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 116725b5edfb..918fd1e8509c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -350,8 +350,9 @@ static int rcu_initiate_boost(void) rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_callbacks(); - } else + } else { RCU_TRACE(rcu_initiate_boost_trace()); + } return 1; } @@ -778,9 +779,9 @@ void synchronize_rcu_expedited(void) rpcp->exp_tasks = NULL; /* Wait for tail of ->blkd_tasks list to drain. */ - if (!rcu_preempted_readers_exp()) + if (!rcu_preempted_readers_exp()) { local_irq_restore(flags); - else { + } else { rcu_initiate_boost(); local_irq_restore(flags); wait_event(sync_rcu_preempt_exp_wq, diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c279ee920947..155fb129b641 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -408,8 +408,9 @@ rcu_torture_cb(struct rcu_head *p) if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; rcu_torture_free(rp); - } else + } else { cur_ops->deferred_free(rp); + } } static int rcu_no_completed(void) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 117218a43724..f280e542e3e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -892,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct if (rnp->qsmask & rdp->grpmask) { rdp->qs_pending = 1; rdp->passed_quiesce = 0; - } else + } else { rdp->qs_pending = 0; + } zero_cpu_stall_ticks(rdp); } } @@ -2130,9 +2131,9 @@ void synchronize_sched_expedited(void) put_online_cpus(); /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_sched(); return; } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a9194d5606c4..7f3244c0df01 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -390,8 +390,9 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grphi, !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rnp, flags); - } else + } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); + } #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ @@ -757,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); - if (list_empty(&rnp->blkd_tasks)) + if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - else { + } else { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ must_wait = 1; @@ -803,9 +804,9 @@ void synchronize_rcu_expedited(void) * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_rcu(); return; } @@ -2093,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_callbacks(cpu)) { trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else + } else { trace_rcu_prep_idle("Callbacks drained"); + } } /* -- cgit v1.2.3 From 5cf05ad758c30d17ff23c2be346b5de982bc2121 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2012 15:12:45 -0700 Subject: rcu: Fix broken strings in RCU's source code. Although the C language allows you to break strings across lines, doing this makes it hard for people to find the Linux kernel code corresponding to a given console message. This commit therefore fixes broken strings throughout RCU's source code. Suggested-by: Josh Triplett Suggested-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 33 ++++++++++++++------------------- kernel/rcutree_trace.c | 25 ++++++++++++------------- 2 files changed, 26 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 155fb129b641..25b15033c61f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -49,8 +49,7 @@ #include MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and " - "Josh Triplett "); +MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -1200,27 +1199,27 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld " - "barrier: %ld/%ld:%ld", + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror, + n_rcu_torture_boost_rterror); + cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers, + n_rcu_torture_timers); + cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", n_online_successes, n_online_attempts, n_offline_successes, - n_offline_attempts, + n_offline_attempts); + cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); @@ -1462,8 +1461,7 @@ rcu_torture_shutdown(void *arg) delta = shutdown_time - jiffies_snap; if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu " - "jiffies remaining\n", + "rcu_torture_shutdown task: %lu jiffies remaining\n", torture_type, delta); schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); @@ -1515,8 +1513,7 @@ rcu_torture_onoff(void *arg) if (cpu_down(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "offlined %d\n", + "rcu_torture_onoff task: offlined %d\n", torture_type, cpu); n_offline_successes++; } @@ -1529,8 +1526,7 @@ rcu_torture_onoff(void *arg) if (cpu_up(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "onlined %d\n", + "rcu_torture_onoff task: onlined %d\n", torture_type, cpu); n_online_successes++; } @@ -1952,8 +1948,7 @@ rcu_torture_init(void) return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " - "fqs_duration, fqs disabled.\n"); + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index a16ddbd6fdc4..abffb486e94e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -218,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) { - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " - "j=%04x bt=%04x\n", + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", rnp->grplo, rnp->grphi, "T."[list_empty(&rnp->blkd_tasks)], "N."[!rnp->gp_tasks], @@ -227,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) "B."[!rnp->boost_tasks], convert_kthread_status(rnp->boost_kthread_status), rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts, + rnp->n_normal_boosts); + seq_printf(m, "j=%04x bt=%04x\n", (int)(jiffies & 0xffff), (int)(rnp->boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - " balk", + seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", rnp->n_balk_blkd_tasks, rnp->n_balk_exp_gp_tasks, rnp->n_balk_boost_tasks, @@ -287,11 +286,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", + seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", rsp->name, rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff), + (int)(jiffies & 0xffff)); + seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); @@ -378,16 +377,16 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { - seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld rpq=%ld cbr=%ld cng=%ld " - "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + seq_printf(m, "%3d%cnp=%ld ", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->n_rcu_pending, + rdp->n_rcu_pending); + seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", rdp->n_rp_qs_pending, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp, + rdp->n_rp_cpu_needs_gp); + seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, rdp->n_rp_need_fqs, -- cgit v1.2.3 From 5c53d819c71c63fdc91f30a59164583f68e2d63a Mon Sep 17 00:00:00 2001 From: liu chuansheng Date: Fri, 6 Jul 2012 09:50:08 -0700 Subject: printk: replacing the raw_spin_lock/unlock with raw_spin_lock/unlock_irq In function devkmsg_read/writev/llseek/poll/open()..., the function raw_spin_lock/unlock is used, there is potential deadlock case happening. CPU1: thread1 doing the cat /dev/kmsg: raw_spin_lock(&logbuf_lock); while (user->seq == log_next_seq) { when thread1 run here, at this time one interrupt is coming on CPU1 and running based on this thread,if the interrupt handle called the printk which need the logbuf_lock spin also, it will cause deadlock. So we should use raw_spin_lock/unlock_irq here. Acked-by: Kay Sievers Signed-off-by: liu chuansheng Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index dba18211685e..12886cd19cd9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -430,20 +430,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } if (user->seq < log_first_seq) { @@ -451,7 +451,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } @@ -501,7 +501,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (len > count) { ret = -EINVAL; @@ -528,7 +528,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); switch (whence) { case SEEK_SET: /* the first record */ @@ -552,7 +552,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -566,14 +566,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; ret = POLLIN|POLLRDNORM; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -597,10 +597,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); file->private_data = user; return 0; -- cgit v1.2.3 From e3f5a5f27153228569f3396049838e9727dae86e Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: escape the backslash character while exporting data Non-printable characters in the log data are hex-escaped to ensure safe post processing. We need to escape a backslash we find in the data, to be able to distinguish it from a backslash we add for the escaping. Also escape the non-printable character 127. Thanks to Miloslav Trmac for the heads up. Reported-by: Michael Neuling Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 12886cd19cd9..505863aa3a7f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -465,7 +465,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, for (i = 0; i < msg->text_len; i++) { unsigned char c = log_text(msg)[i]; - if (c < ' ' || c >= 128) + if (c < ' ' || c >= 127 || c == '\\') len += sprintf(user->buf + len, "\\x%02x", c); else user->buf[len++] = c; @@ -489,7 +489,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, continue; } - if (c < ' ' || c >= 128) { + if (c < ' ' || c >= 127 || c == '\\') { len += sprintf(user->buf + len, "\\x%02x", c); continue; } -- cgit v1.2.3 From 43a73a50b352cd3df25b3ced72033942a6a0f919 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: add the facility number to the syslog prefix After the recent split of facility and level into separate variables, we miss the facility value (always 0 for kernel-originated messages) in the syslog prefix. On Tue, Jul 3, 2012 at 12:45 PM, Dan Carpenter wrote: > Static checkers complain about the impossible condition here. > > In 084681d14e ('printk: flush continuation lines immediately to > console'), we changed msg->level from being a u16 to being an unsigned > 3 bit bitfield. Cc: Dan Carpenter Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 505863aa3a7f..37cde752cb8a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -818,15 +818,18 @@ static size_t print_time(u64 ts, char *buf) static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; + unsigned int prefix = (msg->facility << 3) | msg->level; if (syslog) { if (buf) { - len += sprintf(buf, "<%u>", msg->level); + len += sprintf(buf, "<%u>", prefix); } else { len += 3; - if (msg->level > 9) - len++; - if (msg->level > 99) + if (prefix > 999) + len += 3; + else if (prefix > 99) + len += 2; + else if (prefix > 9) len++; } } -- cgit v1.2.3 From cb424ffe9f45ad80267f2a98fbd9bf21caa0ce22 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: properly handle concurrent non-blocking read() from /proc/kmsg The /proc/kmsg read() interface is internally simply wired up to a sequence of syslog() syscalls, which might are racy between their checks and actions, regarding concurrency. In the (very uncommon) case of concurrent readers of /dev/kmsg, relying on usual O_NONBLOCK behavior, the recently introduced mutex might block an O_NONBLOCK reader in read(), when poll() returns for it, but another process has already read the data in the meantime. We've seen that while running artificial test setups and tools that "fight" about /proc/kmsg data. This restores the original /proc/kmsg behavior, where in case of concurrent read()s, poll() might wake up but the read() syscall will just return 0 to the caller, while another process has "stolen" the data. This is in the general case not the expected behavior, but it is the exact same one, that can easily be triggered with a 3.4 kernel, and some tools might just rely on it. The mutex is not needed, the original integrity issue which introduced it, is in the meantime covered by: "fill buffer with more than a single message for SYSLOG_ACTION_READ" 116e90b23f74d303e8d607c7a7d54f60f14ab9f2 Cc: Yuanhan Liu Acked-by: Jan Beulich Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 37cde752cb8a..be9a82b2f0b3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1021,7 +1021,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; static int saved_console_loglevel = -1; - static DEFINE_MUTEX(syslog_mutex); int error; error = check_syslog_permissions(type, from_file); @@ -1048,17 +1047,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } - error = mutex_lock_interruptible(&syslog_mutex); - if (error) - goto out; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); - if (error) { - mutex_unlock(&syslog_mutex); + if (error) goto out; - } error = syslog_print(buf, len); - mutex_unlock(&syslog_mutex); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: -- cgit v1.2.3 From 68b6507dc554ba015b5ed5e13b1ed4993cdf4024 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: make sure all messages reach a newly registered boot console We suppress printing kmsg records to the console, which are already printed immediately while we have received their fragments. Newly registered boot consoles print the entire kmsg buffer during registration. Clear the console-suppress flag after we skipped the record during its first storage, so any later print will see these records as usual. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index be9a82b2f0b3..f02f1f5ddc30 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1953,6 +1953,12 @@ skip: */ console_idx = log_next(console_idx); console_seq++; + /* + * We will get here again when we register a new + * CON_PRINTBUFFER console. Clear the flag so we + * will properly dump everything later. + */ + msg->flags &= ~LOG_NOCONS; goto skip; } -- cgit v1.2.3 From 7db5b3ca0ecdb2e8fad52a4770e4e320e61c77a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Jul 2012 15:55:47 -0700 Subject: Revert "cgroup: superblock can't be released with active dentries" This reverts commit fa980ca87d15bb8a1317853f257a505990f3ffde. The commit was an attempt to fix a race condition where a cgroup hierarchy may be unmounted with positive dentry reference on root cgroup. While the commit made the race condition slightly more difficult to trigger, the race was still there and could be reliably triggered using a different test case. Revert the incorrect fix. The next commit will describe the race and fix it correctly. Signed-off-by: Tejun Heo LKML-Reference: <4FEEA5CB.8070809@huawei.com> Reported-by: shyju pv Cc: Sasha Levin Acked-by: Li Zefan --- kernel/cgroup.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2097684cf194..5f134a0e0e3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -901,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * We want to drop the active superblock reference from the - * cgroup creation after all the dentry refs are gone - - * kill_sb gets mighty unhappy otherwise. Mark - * dentry->d_fsdata with cgroup_diput() to tell - * cgroup_d_release() to call deactivate_super(). + * Drop the active superblock reference that we took when we + * created the cgroup */ - dentry->d_fsdata = cgroup_diput; + deactivate_super(cgrp->root->sb); /* * if we're getting rid of the cgroup, refcount should ensure @@ -933,13 +930,6 @@ static int cgroup_delete(const struct dentry *d) return 1; } -static void cgroup_d_release(struct dentry *dentry) -{ - /* did cgroup_diput() tell me to deactivate super? */ - if (dentry->d_fsdata == cgroup_diput) - deactivate_super(dentry->d_sb); -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1547,7 +1537,6 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, - .d_release = cgroup_d_release, }; struct inode *inode = -- cgit v1.2.3 From 5db9a4d99b0157a513944e9a44d29c9cec2e91dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Jul 2012 16:08:18 -0700 Subject: cgroup: fix cgroup hierarchy umount race 48ddbe1946 "cgroup: make css->refcnt clearing on cgroup removal optional" allowed a css to linger after the associated cgroup is removed. As a css holds a reference on the cgroup's dentry, it means that cgroup dentries may linger for a while. Destroying a superblock which has dentries with positive refcnts is a critical bug and triggers BUG() in vfs code. As each cgroup dentry holds an s_active reference, any lingering cgroup has both its dentry and the superblock pinned and thus preventing premature release of superblock. Unfortunately, after 48ddbe1946, there's a small window while releasing a cgroup which is directly under the root of the hierarchy. When a cgroup directory is released, vfs layer first deletes the corresponding dentry and then invokes dput() on the parent, which may recurse further, so when a cgroup directly below root cgroup is released, the cgroup is first destroyed - which releases the s_active it was holding - and then the dentry for the root cgroup is dput(). This creates a window where the root dentry's refcnt isn't zero but superblock's s_active is. If umount happens before or during this window, vfs will see the root dentry with non-zero refcnt and trigger BUG(). Before 48ddbe1946, this problem didn't exist because the last dentry reference was guaranteed to be put synchronously from rmdir(2) invocation which holds s_active around the whole process. Fix it by holding an extra superblock->s_active reference across dput() from css release, which is the dput() path added by 48ddbe1946 and the only one which doesn't hold an extra s_active ref across the final cgroup dput(). Signed-off-by: Tejun Heo LKML-Reference: <4FEEA5CB.8070809@huawei.com> Reported-by: shyju pv Tested-by: shyju pv Cc: Sasha Levin Acked-by: Li Zefan --- kernel/cgroup.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5f134a0e0e3f..b303dfc7dce0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3883,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, dput_work); + struct dentry *dentry = css->cgroup->dentry; + struct super_block *sb = dentry->d_sb; - dput(css->cgroup->dentry); + atomic_inc(&sb->s_active); + dput(dentry); + deactivate_super(sb); } static void init_cgroup_css(struct cgroup_subsys_state *css, -- cgit v1.2.3 From eb02dac93708f581c99858a19162af8ca2b6bfcb Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 9 Jul 2012 10:05:10 -0700 Subject: kmsg: /proc/kmsg - support reading of partial log records Restore support for partial reads of any size on /proc/kmsg, in case the supplied read buffer is smaller than the record size. Some people seem to think is is ia good idea to run: $ dd if=/proc/kmsg bs=1 of=... as a klog bridge. Resolves-bug: https://bugzilla.kernel.org/show_bug.cgi?id=44211 Reported-by: Jukka Ollila Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f02f1f5ddc30..50c33411442d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -217,6 +217,7 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; +static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ static u64 log_first_seq; @@ -890,22 +891,33 @@ static int syslog_print(char __user *buf, int size) while (size > 0) { size_t n; + size_t skip; raw_spin_lock_irq(&logbuf_lock); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_partial = 0; } if (syslog_seq == log_next_seq) { raw_spin_unlock_irq(&logbuf_lock); break; } + + skip = syslog_partial; msg = log_from_idx(syslog_idx); n = msg_print_text(msg, true, text, LOG_LINE_MAX); - if (n <= size) { + if (n - syslog_partial <= size) { + /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; + n -= syslog_partial; + syslog_partial = 0; + } else if (!len){ + /* partial read(), remember position */ + n = size; + syslog_partial += n; } else n = 0; raw_spin_unlock_irq(&logbuf_lock); @@ -913,17 +925,15 @@ static int syslog_print(char __user *buf, int size) if (!n) break; - len += n; - size -= n; - buf += n; - n = copy_to_user(buf - n, text, n); - - if (n) { - len -= n; + if (copy_to_user(buf, text + skip, n)) { if (!len) len = -EFAULT; break; } + + len += n; + size -= n; + buf += n; } kfree(text); @@ -1107,6 +1117,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_partial = 0; } if (from_file) { /* @@ -1129,6 +1140,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) idx = log_next(idx); seq++; } + error -= syslog_partial; } raw_spin_unlock_irq(&logbuf_lock); break; -- cgit v1.2.3 From ce27e317ba22b359bde02216afab934dac3af095 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Jul 2012 10:38:06 -0700 Subject: cgroup: cgroup_rm_files() was calling simple_unlink() with the wrong inode While refactoring cgroup file removal path, 05ef1d7c4a "cgroup: introduce struct cfent" incorrectly changed the @dir argument of simple_unlink() to the inode of the file being deleted instead of that of the containing directory. The effect of this bug is minor - ctime and mtime of the parent weren't properly updated on file deletion. Fix it by using @cgrp->dentry->d_inode instead. Signed-off-by: Tejun Heo Reported-by: Al Viro Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dec62f5936ef..a56805aa0f1b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -959,7 +959,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) dget(d); d_delete(d); - simple_unlink(d->d_inode, d); + simple_unlink(cgrp->dentry->d_inode, d); list_del_init(&cfe->node); dput(d); -- cgit v1.2.3 From 5becfb1df5ac8e491338e64b1029685ccad4b39c Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 9 Jul 2012 12:15:42 -0700 Subject: kmsg: merge continuation records while printing In (the unlikely) case our continuation merge buffer is busy, we unfortunately can not merge further continuation printk()s into a single record and have to store them separately, which leads to split-up output of these lines when they are printed. Add some flags about newlines and prefix existence to these records and try to reconstruct the full line again, when the separated records are printed. Reported-By: Michael Neuling Cc: Dave Jones Cc: Linus Torvalds Tested-By: Michael Neuling Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 120 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 50c33411442d..177fa49357a5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -194,8 +194,10 @@ static int console_may_schedule; */ enum log_flags { - LOG_DEFAULT = 0, - LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NEWLINE = 2, /* text ended with a newline */ + LOG_PREFIX = 4, /* text started with a prefix */ + LOG_CONT = 8, /* text is a fragment of a continuation line */ }; struct log { @@ -217,6 +219,7 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; +static enum log_flags syslog_prev; static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ @@ -839,13 +842,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; + bool prefix = true; + bool newline = true; size_t len = 0; + if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) + prefix = false; + + if (msg->flags & LOG_CONT) { + if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) + prefix = false; + + if (!(msg->flags & LOG_NEWLINE)) + newline = false; + } + do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -863,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog, text_len + 1>= size - len) break; - len += print_prefix(msg, syslog, buf + len); + if (prefix) + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - buf[len++] = '\n'; + if (next || newline) + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - len += print_prefix(msg, syslog, NULL); - len += text_len + 1; + if (prefix) + len += print_prefix(msg, syslog, NULL); + len += text_len; + if (next || newline) + len++; } + prefix = true; text = next; } while (text); @@ -898,6 +920,7 @@ static int syslog_print(char __user *buf, int size) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_prev = 0; syslog_partial = 0; } if (syslog_seq == log_next_seq) { @@ -907,11 +930,12 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, text, LOG_LINE_MAX); + n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; + syslog_prev = msg->flags; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -954,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; + enum log_flags prev; if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ @@ -967,10 +992,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) */ seq = clear_seq; idx = clear_idx; + prev = 0; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len += msg_print_text(msg, true, NULL, 0); + len += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; } @@ -978,10 +1004,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; + prev = 0; while (len > size && seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); + len -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; } @@ -990,17 +1017,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear) next_seq = log_next_seq; len = 0; + prev = 0; while (len >= 0 && seq < next_seq) { struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); if (textlen < 0) { len = textlen; break; } idx = log_next(idx); seq++; + prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -1013,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; + prev = 0; } } } @@ -1117,6 +1147,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_prev = 0; syslog_partial = 0; } if (from_file) { @@ -1127,18 +1158,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) */ error = log_next_idx - syslog_idx; } else { - u64 seq; - u32 idx; + u64 seq = syslog_seq; + u32 idx = syslog_idx; + enum log_flags prev = syslog_prev; error = 0; - seq = syslog_seq; - idx = syslog_idx; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - error += msg_print_text(msg, true, NULL, 0); + error += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } error -= syslog_partial; } @@ -1408,10 +1439,9 @@ asmlinkage int vprintk_emit(int facility, int level, static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; + enum log_flags lflags = 0; unsigned long flags; int this_cpu; - bool newline = false; - bool prefix = false; int printed_len = 0; boot_delay_msec(); @@ -1450,7 +1480,7 @@ asmlinkage int vprintk_emit(int facility, int level, recursion_bug = 0; printed_len += strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, LOG_DEFAULT, 0, + log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, printed_len); } @@ -1463,7 +1493,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { text_len--; - newline = true; + lflags |= LOG_NEWLINE; } /* strip syslog prefix and extract log level or control flags */ @@ -1473,7 +1503,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == -1) level = text[1] - '0'; case 'd': /* KERN_DEFAULT */ - prefix = true; + lflags |= LOG_PREFIX; case 'c': /* KERN_CONT */ text += 3; text_len -= 3; @@ -1483,22 +1513,20 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == -1) level = default_message_loglevel; - if (dict) { - prefix = true; - newline = true; - } + if (dict) + lflags |= LOG_PREFIX|LOG_NEWLINE; - if (!newline) { + if (!(lflags & LOG_NEWLINE)) { /* * Flush the conflicting buffer. An earlier newline was missing, * or another task also prints continuation lines. */ - if (cont.len && (prefix || cont.owner != current)) + if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) cont_flush(); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, LOG_DEFAULT, 0, + log_store(facility, level, lflags | LOG_CONT, 0, dict, dictlen, text, text_len); } else { bool stored = false; @@ -1510,13 +1538,13 @@ asmlinkage int vprintk_emit(int facility, int level, * flush it out and store this line separately. */ if (cont.len && cont.owner == current) { - if (!prefix) + if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); cont_flush(); } if (!stored) - log_store(facility, level, LOG_DEFAULT, 0, + log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); } printed_len += text_len; @@ -1615,8 +1643,8 @@ static struct cont { static struct log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) { return 0; } +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ @@ -1892,6 +1920,7 @@ void wake_up_klogd(void) /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; +static enum log_flags console_prev; /** * console_unlock - unlock the console system @@ -1952,6 +1981,7 @@ again: /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; + console_prev = 0; } skip: if (console_seq == log_next_seq) @@ -1975,10 +2005,11 @@ skip: } level = msg->level; - len = msg_print_text(msg, false, text, sizeof(text)); - + len = msg_print_text(msg, console_prev, false, + text, sizeof(text)); console_idx = log_next(console_idx); console_seq++; + console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2241,6 +2272,7 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; + console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -2534,8 +2566,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, } msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, - line, size); + l = msg_print_text(msg, 0, syslog, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -2575,6 +2606,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 idx; u64 next_seq; u32 next_idx; + enum log_flags prev; size_t l = 0; bool ret = false; @@ -2597,23 +2629,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; + prev = 0; while (seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l += msg_print_text(msg, true, NULL, 0); + l += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; + prev = 0; while (l > size && seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l -= msg_print_text(msg, true, NULL, 0); + l -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } /* last message in next interation */ @@ -2621,14 +2657,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; + prev = 0; while (seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l += msg_print_text(msg, syslog, - buf + l, size - l); - + l += msg_print_text(msg, prev, syslog, buf + l, size - l); idx = log_next(idx); seq++; + prev = msg->flags; } dumper->next_seq = next_seq; -- cgit v1.2.3 From f55a6faa384304c89cfef162768e88374d3312cb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:19 -0400 Subject: hrtimer: Provide clock_was_set_delayed() clock_was_set() cannot be called from hard interrupt context because it calls on_each_cpu(). For fixing the widely reported leap seconds issue it is necessary to call it from hard interrupt context, i.e. the timer tick code, which does the timekeeping updates. Provide a new function which denotes it in the hrtimer cpu base structure of the cpu on which it is called and raise the hrtimer softirq. We then execute the clock_was_set() notificiation from softirq context in run_hrtimer_softirq(). The hrtimer softirq is rarely used, so polling the flag there is not a performance issue. [ tglx: Made it depend on CONFIG_HIGH_RES_TIMERS. We really should get rid of all this ifdeffery ASAP ] Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-2-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf51682b..3c24fb2c25c8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -717,6 +717,19 @@ static int hrtimer_switch_to_hres(void) return 1; } +/* + * Called from timekeeping code to reprogramm the hrtimer interrupt + * device. If called from the timer interrupt context we defer it to + * softirq context. + */ +void clock_was_set_delayed(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -1395,6 +1408,13 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + if (cpu_base->clock_was_set) { + cpu_base->clock_was_set = 0; + clock_was_set(); + } + hrtimer_peek_ahead_timers(); } -- cgit v1.2.3 From 4873fa070ae84a4115f0b3c9dfabc224f1bc7c51 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:20 -0400 Subject: timekeeping: Fix leapsecond triggered load spike issue The timekeeping code misses an update of the hrtimer subsystem after a leap second happened. Due to that timers based on CLOCK_REALTIME are either expiring a second early or late depending on whether a leap second has been inserted or deleted until an operation is initiated which causes that update. Unless the update happens by some other means this discrepancy between the timekeeping and the hrtimer data stays forever and timers are expired either early or late. The reported immediate workaround - $ data -s "`date`" - is causing a call to clock_was_set() which updates the hrtimer data structures. See: http://www.sheeri.com/content/mysql-and-leap-second-high-cpu-and-fix Add the missing clock_was_set() call to update_wall_time() in case of a leap second event. The actual update is deferred to softirq context as the necessary smp function call cannot be invoked from hard interrupt context. Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-3-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00a1e8a..a413e5940e06 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -963,6 +963,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } /* Accumulate raw time */ @@ -1079,6 +1081,8 @@ static void update_wall_time(void) leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } timekeeping_update(false); -- cgit v1.2.3 From 5b9fe759a678e05be4937ddf03d50e950207c1c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:21 -0400 Subject: timekeeping: Maintain ktime_t based offsets for hrtimers We need to update the hrtimer clock offsets from the hrtimer interrupt context. To avoid conversions from timespec to ktime_t maintain a ktime_t based representation of those offsets in the timekeeper. This puts the conversion overhead into the code which updates the underlying offsets and provides fast accessible values in the hrtimer interrupt. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-4-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a413e5940e06..1c038dac71a2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -70,6 +70,12 @@ struct timekeeper { /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ struct timespec raw_time; + /* Offset clock monotonic -> clock realtime */ + ktime_t offs_real; + + /* Offset clock monotonic -> clock boottime */ + ktime_t offs_boot; + /* Seqlock for all timekeeper values */ seqlock_t lock; }; @@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } +static void update_rt_offset(void) +{ + struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + + set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); + timekeeper.offs_real = timespec_to_ktime(tmp); +} + /* must hold write on timekeeper.lock */ static void timekeeping_update(bool clearntp) { @@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) timekeeper.ntp_error = 0; ntp_clear(); } + update_rt_offset(); update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -604,6 +619,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_rt_offset(); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -612,6 +628,12 @@ void __init timekeeping_init(void) /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; +static void update_sleep_time(struct timespec t) +{ + timekeeper.total_sleep_time = t; + timekeeper.offs_boot = timespec_to_ktime(t); +} + /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *delta); - timekeeper.total_sleep_time = timespec_add( - timekeeper.total_sleep_time, *delta); + update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); } -- cgit v1.2.3 From 196951e91262fccda81147d2bcf7fdab08668b40 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:23 -0400 Subject: hrtimers: Move lock held region in hrtimer_interrupt() We need to update the base offsets from this code and we need to do that under base->lock. Move the lock held region around the ktime_get() calls. The ktime_get() calls are going to be replaced with a function which gets the time and the offsets atomically. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-6-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 3c24fb2c25c8..8f320af837b5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1263,11 +1263,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; + raw_spin_lock(&cpu_base->lock); entry_time = now = ktime_get(); retry: expires_next.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1344,6 +1343,7 @@ retry: * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. */ + raw_spin_lock(&cpu_base->lock); now = ktime_get(); cpu_base->nr_retries++; if (++retries < 3) @@ -1356,6 +1356,7 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; + raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); if (delta.tv64 > cpu_base->max_hang_time.tv64) cpu_base->max_hang_time = delta; -- cgit v1.2.3 From f6c06abfb3972ad4914cef57d8348fcb2932bc3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:24 -0400 Subject: timekeeping: Provide hrtimer update function To finally fix the infamous leap second issue and other race windows caused by functions which change the offsets between the various time bases (CLOCK_MONOTONIC, CLOCK_REALTIME and CLOCK_BOOTTIME) we need a function which atomically gets the current monotonic time and updates the offsets of CLOCK_REALTIME and CLOCK_BOOTTIME with minimalistic overhead. The previous patch which provides ktime_t offsets allows us to make this function almost as cheap as ktime_get() which is going to be replaced in hrtimer_interrupt(). Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-7-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1c038dac71a2..269b1fe5f2ae 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1271,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } while (read_seqretry(&timekeeper.lock, seq)); } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +{ + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqbegin(&timekeeper.lock); + + secs = timekeeper.xtime.tv_sec; + nsecs = timekeeper.xtime.tv_nsec; + nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + + *offs_real = timekeeper.offs_real; + *offs_boot = timekeeper.offs_boot; + } while (read_seqretry(&timekeeper.lock, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *offs_real); + return now; +} +#endif + /** * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format */ -- cgit v1.2.3 From 5baefd6d84163443215f4a99f6a20f054ef11236 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:25 -0400 Subject: hrtimer: Update hrtimer base offsets each hrtimer_interrupt The update of the hrtimer base offsets on all cpus cannot be made atomically from the timekeeper.lock held and interrupt disabled region as smp function calls are not allowed there. clock_was_set(), which enforces the update on all cpus, is called either from preemptible process context in case of do_settimeofday() or from the softirq context when the offset modification happened in the timer interrupt itself due to a leap second. In both cases there is a race window for an hrtimer interrupt between dropping timekeeper lock, enabling interrupts and clock_was_set() issuing the updates. Any interrupt which arrives in that window will see the new time but operate on stale offsets. So we need to make sure that an hrtimer interrupt always sees a consistent state of time and offsets. ktime_get_update_offsets() allows us to get the current monotonic time and update the per cpu hrtimer base offsets from hrtimer_interrupt() to capture a consistent state of monotonic time and the offsets. The function replaces the existing ktime_get() calls in hrtimer_interrupt(). The overhead of the new function vs. ktime_get() is minimal as it just adds two store operations. This ensures that any changes to realtime or boottime offsets are noticed and stored into the per-cpu hrtimer base structures, prior to any hrtimer expiration and guarantees that timers are not expired early. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-8-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8f320af837b5..6db7a5ed52b5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + + return ktime_get_update_offsets(offs_real, offs_boot); +} + /* * Retrigger next event is called after clock was set * @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); - struct timespec realtime_offset, xtim, wtm, sleep; if (!hrtimer_hres_active()) return; - /* Optimized out for !HIGH_RES */ - get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - + hrtimer_update_base(base); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); } @@ -710,7 +708,6 @@ static int hrtimer_switch_to_hres(void) base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); - /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); @@ -1264,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; raw_spin_lock(&cpu_base->lock); - entry_time = now = ktime_get(); + entry_time = now = hrtimer_update_base(cpu_base); retry: expires_next.tv64 = KTIME_MAX; /* @@ -1342,9 +1339,12 @@ retry: * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. */ raw_spin_lock(&cpu_base->lock); - now = ktime_get(); + now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; -- cgit v1.2.3 From 4229fb1dc6843c49a14bb098719f8a696cdc44f8 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 11 Jul 2012 14:02:11 -0700 Subject: c/r: prctl: less paranoid prctl_set_mm_exe_file() "no other files mapped" requirement from my previous patch (c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal) is too paranoid, it forbids operation even if there mapped one shared-anon vma. Let's check that current mm->exe_file already unmapped, in this case exe_file symlink already outdated and its changing is reasonable. Plus, this patch fixes exit code in case operation success. Signed-off-by: Konstantin Khlebnikov Reported-by: Cyrill Gorcunov Tested-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index e0c8ffc50d7f..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask) #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; @@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) down_write(&mm->mmap_sem); /* - * Forbid mm->exe_file change if there are mapped other files. + * Forbid mm->exe_file change if old file still mapped. */ err = -EBUSY; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_file && !path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_unlock; + if (mm->exe_file) { + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_file && + path_equal(&vma->vm_file->f_path, + &mm->exe_file->f_path)) + goto exit_unlock; } /* @@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) goto exit_unlock; + err = 0; set_mm_exe_file(mm, exe_file); exit_unlock: up_write(&mm->mmap_sem); -- cgit v1.2.3 From 93574fcc5b50cc7b8834698acb2ce947e5b6a5dc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Jul 2012 09:35:08 +0300 Subject: tracing: Check for allocation failure in __tracing_open() Clean up and return -ENOMEM on if the kzalloc() fails. This also prevents a potential crash, as the pointer that failed to allocate would be later used. Link: http://lkml.kernel.org/r/20120711063507.GF11812@elgon.mountain Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 814ff306ae74..a120f98c4112 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2390,6 +2390,9 @@ __tracing_open(struct inode *inode, struct file *file) iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), GFP_KERNEL); + if (!iter->buffer_iter) + goto release; + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. @@ -2451,6 +2454,7 @@ __tracing_open(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); kfree(iter->trace); kfree(iter->buffer_iter); +release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3 From 974271c485a4d8bb801decc616748f90aafb07ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Jul 2012 14:46:37 -0700 Subject: workqueue: don't use WQ_HIGHPRI for unbound workqueues Unbound wqs aren't concurrency-managed and try to execute work items as soon as possible. This is currently achieved by implicitly setting %WQ_HIGHPRI on all unbound workqueues; however, WQ_HIGHPRI implementation is about to be restructured and this usage won't be valid anymore. Add an explicit chain-wakeup path for unbound workqueues in process_one_work() instead of piggy backing on %WQ_HIGHPRI. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a3128dc67df..27637c284cb9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -580,6 +580,10 @@ static bool __need_more_worker(struct global_cwq *gcwq) /* * Need to wake up a worker? Called from anything but currently * running workers. + * + * Note that, because unbound workers never contribute to nr_running, this + * function will always return %true for unbound gcwq as long as the + * worklist isn't empty. */ static bool need_more_worker(struct global_cwq *gcwq) { @@ -1867,6 +1871,13 @@ __acquires(&gcwq->lock) if (unlikely(cpu_intensive)) worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + /* + * Unbound gcwq isn't concurrency managed and work items should be + * executed ASAP. Wake up another worker if necessary. + */ + if ((worker->flags & WORKER_UNBOUND) && need_more_worker(gcwq)) + wake_up_worker(gcwq); + spin_unlock_irq(&gcwq->lock); work_clear_pending(work); @@ -2984,13 +2995,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) flags |= WQ_RESCUER; - /* - * Unbound workqueues aren't concurrency managed and should be - * dispatched to workers immediately. - */ - if (flags & WQ_UNBOUND) - flags |= WQ_HIGHPRI; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); -- cgit v1.2.3 From bd7bdd43dcb81bb08240b9401b36a104f77dc135 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Jul 2012 14:46:37 -0700 Subject: workqueue: factor out worker_pool from global_cwq Move worklist and all worker management fields from global_cwq into the new struct worker_pool. worker_pool points back to the containing gcwq. worker and cpu_workqueue_struct are updated to point to worker_pool instead of gcwq too. This change is mechanical and doesn't introduce any functional difference other than rearranging of fields and an added level of indirection in some places. This is to prepare for multiple pools per gcwq. v2: Comment typo fixes as suggested by Namhyung. Signed-off-by: Tejun Heo Cc: Namhyung Kim --- kernel/workqueue.c | 216 +++++++++++++++++++++++++++++------------------------ 1 file changed, 117 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 27637c284cb9..61f154467026 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -115,6 +115,7 @@ enum { */ struct global_cwq; +struct worker_pool; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -131,7 +132,7 @@ struct worker { struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ @@ -139,6 +140,21 @@ struct worker { struct work_struct rebind_work; /* L: rebind worker to cpu */ }; +struct worker_pool { + struct global_cwq *gcwq; /* I: the owning gcwq */ + + struct list_head worklist; /* L: list of pending works */ + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + struct list_head idle_list; /* X: list of idle workers */ + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for workers */ + + struct ida worker_ida; /* L: for worker IDs */ + struct worker *first_idle; /* L: first idle worker */ +}; + /* * Global per-cpu workqueue. There's one and only one for each cpu * and all works are queued and processed here regardless of their @@ -146,27 +162,18 @@ struct worker { */ struct global_cwq { spinlock_t lock; /* the gcwq lock */ - struct list_head worklist; /* L: list of pending works */ unsigned int cpu; /* I: the associated cpu */ unsigned int flags; /* L: GCWQ_* flags */ - int nr_workers; /* L: total number of workers */ - int nr_idle; /* L: currently idle ones */ - - /* workers are chained either in the idle_list or busy_hash */ - struct list_head idle_list; /* X: list of idle workers */ + /* workers are chained either in busy_hash or pool idle_list */ struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct timer_list idle_timer; /* L: worker idle timeout */ - struct timer_list mayday_timer; /* L: SOS timer for dworkers */ - - struct ida worker_ida; /* L: for worker IDs */ + struct worker_pool pool; /* the worker pools */ struct task_struct *trustee; /* L: for gcwq shutdown */ unsigned int trustee_state; /* L: trustee state */ wait_queue_head_t trustee_wait; /* trustee wait */ - struct worker *first_idle; /* L: first idle worker */ } ____cacheline_aligned_in_smp; /* @@ -175,7 +182,7 @@ struct global_cwq { * aligned at two's power of the number of flag bits. */ struct cpu_workqueue_struct { - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ @@ -555,7 +562,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) if (data & WORK_STRUCT_CWQ) return ((struct cpu_workqueue_struct *) - (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; cpu = data >> WORK_STRUCT_FLAG_BITS; if (cpu == WORK_CPU_NONE) @@ -587,13 +594,13 @@ static bool __need_more_worker(struct global_cwq *gcwq) */ static bool need_more_worker(struct global_cwq *gcwq) { - return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); + return !list_empty(&gcwq->pool.worklist) && __need_more_worker(gcwq); } /* Can I start working? Called from busy but !running workers. */ static bool may_start_working(struct global_cwq *gcwq) { - return gcwq->nr_idle; + return gcwq->pool.nr_idle; } /* Do I need to keep working? Called from currently running workers. */ @@ -601,7 +608,7 @@ static bool keep_working(struct global_cwq *gcwq) { atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); - return !list_empty(&gcwq->worklist) && + return !list_empty(&gcwq->pool.worklist) && (atomic_read(nr_running) <= 1 || gcwq->flags & GCWQ_HIGHPRI_PENDING); } @@ -622,8 +629,8 @@ static bool need_to_manage_workers(struct global_cwq *gcwq) static bool too_many_workers(struct global_cwq *gcwq) { bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; - int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ - int nr_busy = gcwq->nr_workers - nr_idle; + int nr_idle = gcwq->pool.nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->pool.nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -635,10 +642,10 @@ static bool too_many_workers(struct global_cwq *gcwq) /* Return the first worker. Safe with preemption disabled */ static struct worker *first_worker(struct global_cwq *gcwq) { - if (unlikely(list_empty(&gcwq->idle_list))) + if (unlikely(list_empty(&gcwq->pool.idle_list))) return NULL; - return list_first_entry(&gcwq->idle_list, struct worker, entry); + return list_first_entry(&gcwq->pool.idle_list, struct worker, entry); } /** @@ -696,7 +703,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; atomic_t *nr_running = get_gcwq_nr_running(cpu); if (worker->flags & WORKER_NOT_RUNNING) @@ -716,7 +724,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * could be manipulating idle_list, so dereferencing idle_list * without gcwq lock is safe. */ - if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) to_wakeup = first_worker(gcwq); return to_wakeup ? to_wakeup->task : NULL; } @@ -737,7 +745,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; WARN_ON_ONCE(worker->task != current); @@ -752,7 +761,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, if (wakeup) { if (atomic_dec_and_test(nr_running) && - !list_empty(&gcwq->worklist)) + !list_empty(&pool->worklist)) wake_up_worker(gcwq); } else atomic_dec(nr_running); @@ -773,7 +782,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; unsigned int oflags = worker->flags; WARN_ON_ONCE(worker->task != current); @@ -894,9 +903,9 @@ static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, struct work_struct *twork; if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &gcwq->worklist; + return &gcwq->pool.worklist; - list_for_each_entry(twork, &gcwq->worklist, entry) { + list_for_each_entry(twork, &gcwq->pool.worklist, entry) { struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); if (!(tcwq->wq->flags & WQ_HIGHPRI)) @@ -924,7 +933,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct global_cwq *gcwq = cwq->gcwq; + struct global_cwq *gcwq = cwq->pool->gcwq; /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); @@ -1196,7 +1205,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); */ static void worker_enter_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; BUG_ON(worker->flags & WORKER_IDLE); BUG_ON(!list_empty(&worker->entry) && @@ -1204,15 +1214,15 @@ static void worker_enter_idle(struct worker *worker) /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; - gcwq->nr_idle++; + pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ - list_add(&worker->entry, &gcwq->idle_list); + list_add(&worker->entry, &pool->idle_list); if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) - mod_timer(&gcwq->idle_timer, + if (too_many_workers(gcwq) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); } else wake_up_all(&gcwq->trustee_wait); @@ -1223,7 +1233,7 @@ static void worker_enter_idle(struct worker *worker) * warning may trigger spuriously. Check iff trustee is idle. */ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && - gcwq->nr_workers == gcwq->nr_idle && + pool->nr_workers == pool->nr_idle && atomic_read(get_gcwq_nr_running(gcwq->cpu))); } @@ -1238,11 +1248,11 @@ static void worker_enter_idle(struct worker *worker) */ static void worker_leave_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; BUG_ON(!(worker->flags & WORKER_IDLE)); worker_clr_flags(worker, WORKER_IDLE); - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); } @@ -1279,7 +1289,7 @@ static void worker_leave_idle(struct worker *worker) static bool worker_maybe_bind_and_lock(struct worker *worker) __acquires(&gcwq->lock) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; struct task_struct *task = worker->task; while (true) { @@ -1321,7 +1331,7 @@ __acquires(&gcwq->lock) static void worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_REBIND); @@ -1362,13 +1372,14 @@ static struct worker *alloc_worker(void) static struct worker *create_worker(struct global_cwq *gcwq, bool bind) { bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct worker_pool *pool = &gcwq->pool; struct worker *worker = NULL; int id = -1; spin_lock_irq(&gcwq->lock); - while (ida_get_new(&gcwq->worker_ida, &id)) { + while (ida_get_new(&pool->worker_ida, &id)) { spin_unlock_irq(&gcwq->lock); - if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) goto fail; spin_lock_irq(&gcwq->lock); } @@ -1378,7 +1389,7 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) if (!worker) goto fail; - worker->gcwq = gcwq; + worker->pool = pool; worker->id = id; if (!on_unbound_cpu) @@ -1409,7 +1420,7 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) fail: if (id >= 0) { spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); spin_unlock_irq(&gcwq->lock); } kfree(worker); @@ -1428,7 +1439,7 @@ fail: static void start_worker(struct worker *worker) { worker->flags |= WORKER_STARTED; - worker->gcwq->nr_workers++; + worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); } @@ -1444,7 +1455,8 @@ static void start_worker(struct worker *worker) */ static void destroy_worker(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; int id = worker->id; /* sanity check frenzy */ @@ -1452,9 +1464,9 @@ static void destroy_worker(struct worker *worker) BUG_ON(!list_empty(&worker->scheduled)); if (worker->flags & WORKER_STARTED) - gcwq->nr_workers--; + pool->nr_workers--; if (worker->flags & WORKER_IDLE) - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1465,7 +1477,7 @@ static void destroy_worker(struct worker *worker) kfree(worker); spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); } static void idle_worker_timeout(unsigned long __gcwq) @@ -1479,11 +1491,12 @@ static void idle_worker_timeout(unsigned long __gcwq) unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(gcwq->pool.idle_list.prev, struct worker, + entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&gcwq->pool.idle_timer, expires); else { /* it's been idle for too long, wake up manager */ gcwq->flags |= GCWQ_MANAGE_WORKERS; @@ -1504,7 +1517,7 @@ static bool send_mayday(struct work_struct *work) return false; /* mayday mayday mayday */ - cpu = cwq->gcwq->cpu; + cpu = cwq->pool->gcwq->cpu; /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ if (cpu == WORK_CPU_UNBOUND) cpu = 0; @@ -1527,13 +1540,13 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * allocation deadlock. Send distress signals to * rescuers. */ - list_for_each_entry(work, &gcwq->worklist, entry) + list_for_each_entry(work, &gcwq->pool.worklist, entry) send_mayday(work); } spin_unlock_irq(&gcwq->lock); - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); + mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INTERVAL); } /** @@ -1568,14 +1581,14 @@ restart: spin_unlock_irq(&gcwq->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { struct worker *worker; worker = create_worker(gcwq, true); if (worker) { - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&gcwq->pool.mayday_timer); spin_lock_irq(&gcwq->lock); start_worker(worker); BUG_ON(need_to_create_worker(gcwq)); @@ -1592,7 +1605,7 @@ restart: break; } - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&gcwq->pool.mayday_timer); spin_lock_irq(&gcwq->lock); if (need_to_create_worker(gcwq)) goto restart; @@ -1622,11 +1635,12 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) struct worker *worker; unsigned long expires; - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(gcwq->pool.idle_list.prev, struct worker, + entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&gcwq->pool.idle_timer, expires); break; } @@ -1659,7 +1673,7 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) */ static bool manage_workers(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; bool ret = false; if (gcwq->flags & GCWQ_MANAGING_WORKERS) @@ -1732,7 +1746,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + struct list_head *pos = gcwq_determine_ins_pos(cwq->pool->gcwq, cwq); trace_workqueue_activate_work(work); move_linked_works(work, pos, NULL); @@ -1808,7 +1822,8 @@ __releases(&gcwq->lock) __acquires(&gcwq->lock) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; struct hlist_head *bwh = busy_worker_head(gcwq, work); bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; work_func_t f = work->func; @@ -1854,10 +1869,10 @@ __acquires(&gcwq->lock) * wake up another worker; otherwise, clear HIGHPRI_PENDING. */ if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { - struct work_struct *nwork = list_first_entry(&gcwq->worklist, - struct work_struct, entry); + struct work_struct *nwork = list_first_entry(&pool->worklist, + struct work_struct, entry); - if (!list_empty(&gcwq->worklist) && + if (!list_empty(&pool->worklist) && get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) wake_up_worker(gcwq); else @@ -1950,7 +1965,8 @@ static void process_scheduled_works(struct worker *worker) static int worker_thread(void *__worker) { struct worker *worker = __worker; - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; /* tell the scheduler that this is a workqueue worker */ worker->task->flags |= PF_WQ_WORKER; @@ -1990,7 +2006,7 @@ recheck: do { struct work_struct *work = - list_first_entry(&gcwq->worklist, + list_first_entry(&pool->worklist, struct work_struct, entry); if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { @@ -2064,14 +2080,15 @@ repeat: for_each_mayday_cpu(cpu, wq->mayday_mask) { unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = cwq->pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); mayday_clear_cpu(cpu, wq->mayday_mask); /* migrate to the target cpu if possible */ - rescuer->gcwq = gcwq; + rescuer->pool = pool; worker_maybe_bind_and_lock(rescuer); /* @@ -2079,7 +2096,7 @@ repeat: * process'em. */ BUG_ON(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_cwq(work) == cwq) move_linked_works(work, scheduled, &n); @@ -2216,7 +2233,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct global_cwq *gcwq = cwq->pool->gcwq; spin_lock_irq(&gcwq->lock); @@ -2432,9 +2449,9 @@ reflush: struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); bool drained; - spin_lock_irq(&cwq->gcwq->lock); + spin_lock_irq(&cwq->pool->gcwq->lock); drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->gcwq->lock); + spin_unlock_irq(&cwq->pool->gcwq->lock); if (drained) continue; @@ -2474,7 +2491,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, */ smp_rmb(); cwq = get_work_cwq(work); - if (unlikely(!cwq || gcwq != cwq->gcwq)) + if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; } else if (wait_executing) { worker = find_worker_executing_work(gcwq, work); @@ -3017,7 +3034,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct global_cwq *gcwq = get_gcwq(cpu); BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->gcwq = gcwq; + cwq->pool = &gcwq->pool; cwq->wq = wq; cwq->flush_color = -1; cwq->max_active = max_active; @@ -3344,7 +3361,7 @@ static int __cpuinit trustee_thread(void *__gcwq) gcwq->flags |= GCWQ_MANAGING_WORKERS; - list_for_each_entry(worker, &gcwq->idle_list, entry) + list_for_each_entry(worker, &gcwq->pool.idle_list, entry) worker->flags |= WORKER_ROGUE; for_each_busy_worker(worker, i, pos, gcwq) @@ -3369,7 +3386,7 @@ static int __cpuinit trustee_thread(void *__gcwq) atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->idle_timer); + del_timer_sync(&gcwq->pool.idle_timer); spin_lock_irq(&gcwq->lock); /* @@ -3391,17 +3408,17 @@ static int __cpuinit trustee_thread(void *__gcwq) * may be frozen works in freezable cwqs. Don't declare * completion while frozen. */ - while (gcwq->nr_workers != gcwq->nr_idle || + while (gcwq->pool.nr_workers != gcwq->pool.nr_idle || gcwq->flags & GCWQ_FREEZING || gcwq->trustee_state == TRUSTEE_IN_CHARGE) { int nr_works = 0; - list_for_each_entry(work, &gcwq->worklist, entry) { + list_for_each_entry(work, &gcwq->pool.worklist, entry) { send_mayday(work); nr_works++; } - list_for_each_entry(worker, &gcwq->idle_list, entry) { + list_for_each_entry(worker, &gcwq->pool.idle_list, entry) { if (!nr_works--) break; wake_up_process(worker->task); @@ -3428,11 +3445,11 @@ static int __cpuinit trustee_thread(void *__gcwq) * all workers till we're canceled. */ do { - rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); - while (!list_empty(&gcwq->idle_list)) - destroy_worker(list_first_entry(&gcwq->idle_list, + rc = trustee_wait_event(!list_empty(&gcwq->pool.idle_list)); + while (!list_empty(&gcwq->pool.idle_list)) + destroy_worker(list_first_entry(&gcwq->pool.idle_list, struct worker, entry)); - } while (gcwq->nr_workers && rc >= 0); + } while (gcwq->pool.nr_workers && rc >= 0); /* * At this point, either draining has completed and no worker @@ -3441,7 +3458,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * Tell the remaining busy ones to rebind once it finishes the * currently scheduled works by scheduling the rebind_work. */ - WARN_ON(!list_empty(&gcwq->idle_list)); + WARN_ON(!list_empty(&gcwq->pool.idle_list)); for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; @@ -3522,7 +3539,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, kthread_bind(new_trustee, cpu); /* fall through */ case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); + BUG_ON(gcwq->pool.first_idle); new_worker = create_worker(gcwq, false); if (!new_worker) { if (new_trustee) @@ -3544,8 +3561,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); /* fall through */ case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - gcwq->first_idle = new_worker; + BUG_ON(gcwq->pool.first_idle); + gcwq->pool.first_idle = new_worker; break; case CPU_DYING: @@ -3562,8 +3579,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, gcwq->trustee_state = TRUSTEE_BUTCHER; /* fall through */ case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + destroy_worker(gcwq->pool.first_idle); + gcwq->pool.first_idle = NULL; break; case CPU_DOWN_FAILED: @@ -3581,11 +3598,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, * take a look. */ spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); + kthread_bind(gcwq->pool.first_idle->task, cpu); spin_lock_irq(&gcwq->lock); gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + start_worker(gcwq->pool.first_idle); + gcwq->pool.first_idle = NULL; break; } @@ -3794,22 +3811,23 @@ static int __init init_workqueues(void) struct global_cwq *gcwq = get_gcwq(cpu); spin_lock_init(&gcwq->lock); - INIT_LIST_HEAD(&gcwq->worklist); + gcwq->pool.gcwq = gcwq; + INIT_LIST_HEAD(&gcwq->pool.worklist); gcwq->cpu = cpu; gcwq->flags |= GCWQ_DISASSOCIATED; - INIT_LIST_HEAD(&gcwq->idle_list); + INIT_LIST_HEAD(&gcwq->pool.idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - init_timer_deferrable(&gcwq->idle_timer); - gcwq->idle_timer.function = idle_worker_timeout; - gcwq->idle_timer.data = (unsigned long)gcwq; + init_timer_deferrable(&gcwq->pool.idle_timer); + gcwq->pool.idle_timer.function = idle_worker_timeout; + gcwq->pool.idle_timer.data = (unsigned long)gcwq; - setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout, (unsigned long)gcwq); - ida_init(&gcwq->worker_ida); + ida_init(&gcwq->pool.worker_ida); gcwq->trustee_state = TRUSTEE_DONE; init_waitqueue_head(&gcwq->trustee_wait); -- cgit v1.2.3 From 63d95a9150ee3bbd4117fcd609dee40313b454d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Jul 2012 14:46:37 -0700 Subject: workqueue: use @pool instead of @gcwq or @cpu where applicable Modify all functions which deal with per-pool properties to pass around @pool instead of @gcwq or @cpu. The changes in this patch are mechanical and don't caues any functional difference. This is to prepare for multiple pools per gcwq. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 218 +++++++++++++++++++++++++++-------------------------- 1 file changed, 111 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 61f154467026..2d82f7b193a0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -471,8 +471,10 @@ static struct global_cwq *get_gcwq(unsigned int cpu) return &unbound_global_cwq; } -static atomic_t *get_gcwq_nr_running(unsigned int cpu) +static atomic_t *get_pool_nr_running(struct worker_pool *pool) { + int cpu = pool->gcwq->cpu; + if (cpu != WORK_CPU_UNBOUND) return &per_cpu(gcwq_nr_running, cpu); else @@ -578,10 +580,10 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) * assume that they're being called with gcwq->lock held. */ -static bool __need_more_worker(struct global_cwq *gcwq) +static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || - gcwq->flags & GCWQ_HIGHPRI_PENDING; + return !atomic_read(get_pool_nr_running(pool)) || + pool->gcwq->flags & GCWQ_HIGHPRI_PENDING; } /* @@ -592,45 +594,46 @@ static bool __need_more_worker(struct global_cwq *gcwq) * function will always return %true for unbound gcwq as long as the * worklist isn't empty. */ -static bool need_more_worker(struct global_cwq *gcwq) +static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&gcwq->pool.worklist) && __need_more_worker(gcwq); + return !list_empty(&pool->worklist) && __need_more_worker(pool); } /* Can I start working? Called from busy but !running workers. */ -static bool may_start_working(struct global_cwq *gcwq) +static bool may_start_working(struct worker_pool *pool) { - return gcwq->pool.nr_idle; + return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ -static bool keep_working(struct global_cwq *gcwq) +static bool keep_working(struct worker_pool *pool) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); - return !list_empty(&gcwq->pool.worklist) && + return !list_empty(&pool->worklist) && (atomic_read(nr_running) <= 1 || - gcwq->flags & GCWQ_HIGHPRI_PENDING); + pool->gcwq->flags & GCWQ_HIGHPRI_PENDING); } /* Do we need a new worker? Called from manager. */ -static bool need_to_create_worker(struct global_cwq *gcwq) +static bool need_to_create_worker(struct worker_pool *pool) { - return need_more_worker(gcwq) && !may_start_working(gcwq); + return need_more_worker(pool) && !may_start_working(pool); } /* Do I need to be the manager? */ -static bool need_to_manage_workers(struct global_cwq *gcwq) +static bool need_to_manage_workers(struct worker_pool *pool) { - return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; + return need_to_create_worker(pool) || + pool->gcwq->flags & GCWQ_MANAGE_WORKERS; } /* Do we have too many workers and should some go away? */ -static bool too_many_workers(struct global_cwq *gcwq) +static bool too_many_workers(struct worker_pool *pool) { - bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; - int nr_idle = gcwq->pool.nr_idle + managing; /* manager is considered idle */ - int nr_busy = gcwq->pool.nr_workers - nr_idle; + bool managing = pool->gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ + int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -640,26 +643,26 @@ static bool too_many_workers(struct global_cwq *gcwq) */ /* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct global_cwq *gcwq) +static struct worker *first_worker(struct worker_pool *pool) { - if (unlikely(list_empty(&gcwq->pool.idle_list))) + if (unlikely(list_empty(&pool->idle_list))) return NULL; - return list_first_entry(&gcwq->pool.idle_list, struct worker, entry); + return list_first_entry(&pool->idle_list, struct worker, entry); } /** * wake_up_worker - wake up an idle worker - * @gcwq: gcwq to wake worker for + * @pool: worker pool to wake worker from * - * Wake up the first idle worker of @gcwq. + * Wake up the first idle worker of @pool. * * CONTEXT: * spin_lock_irq(gcwq->lock). */ -static void wake_up_worker(struct global_cwq *gcwq) +static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(gcwq); + struct worker *worker = first_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -681,7 +684,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) struct worker *worker = kthread_data(task); if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(cpu)); + atomic_inc(get_pool_nr_running(worker->pool)); } /** @@ -704,8 +707,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; - atomic_t *nr_running = get_gcwq_nr_running(cpu); + atomic_t *nr_running = get_pool_nr_running(pool); if (worker->flags & WORKER_NOT_RUNNING) return NULL; @@ -725,7 +727,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * without gcwq lock is safe. */ if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) - to_wakeup = first_worker(gcwq); + to_wakeup = first_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -746,7 +748,6 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; WARN_ON_ONCE(worker->task != current); @@ -757,12 +758,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); if (wakeup) { if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) - wake_up_worker(gcwq); + wake_up_worker(pool); } else atomic_dec(nr_running); } @@ -782,7 +783,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { - struct global_cwq *gcwq = worker->pool->gcwq; + struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; WARN_ON_ONCE(worker->task != current); @@ -796,7 +797,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(gcwq->cpu)); + atomic_inc(get_pool_nr_running(pool)); } /** @@ -880,15 +881,15 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, } /** - * gcwq_determine_ins_pos - find insertion position - * @gcwq: gcwq of interest + * pool_determine_ins_pos - find insertion position + * @pool: pool of interest * @cwq: cwq a work is being queued for * - * A work for @cwq is about to be queued on @gcwq, determine insertion + * A work for @cwq is about to be queued on @pool, determine insertion * position for the work. If @cwq is for HIGHPRI wq, the work is * queued at the head of the queue but in FIFO order with respect to * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @pool that * there are HIGHPRI works pending. * * CONTEXT: @@ -897,22 +898,22 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, * RETURNS: * Pointer to inserstion position. */ -static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, +static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool, struct cpu_workqueue_struct *cwq) { struct work_struct *twork; if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &gcwq->pool.worklist; + return &pool->worklist; - list_for_each_entry(twork, &gcwq->pool.worklist, entry) { + list_for_each_entry(twork, &pool->worklist, entry) { struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); if (!(tcwq->wq->flags & WQ_HIGHPRI)) break; } - gcwq->flags |= GCWQ_HIGHPRI_PENDING; + pool->gcwq->flags |= GCWQ_HIGHPRI_PENDING; return &twork->entry; } @@ -933,7 +934,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct global_cwq *gcwq = cwq->pool->gcwq; + struct worker_pool *pool = cwq->pool; /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); @@ -953,8 +954,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq, */ smp_mb(); - if (__need_more_worker(gcwq)) - wake_up_worker(gcwq); + if (__need_more_worker(pool)) + wake_up_worker(pool); } /* @@ -1056,7 +1057,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (likely(cwq->nr_active < cwq->max_active)) { trace_workqueue_activate_work(work); cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); + worklist = pool_determine_ins_pos(cwq->pool, cwq); } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; @@ -1221,7 +1222,7 @@ static void worker_enter_idle(struct worker *worker) list_add(&worker->entry, &pool->idle_list); if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&pool->idle_timer)) + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); } else @@ -1234,7 +1235,7 @@ static void worker_enter_idle(struct worker *worker) */ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && pool->nr_workers == pool->nr_idle && - atomic_read(get_gcwq_nr_running(gcwq->cpu))); + atomic_read(get_pool_nr_running(pool))); } /** @@ -1356,10 +1357,10 @@ static struct worker *alloc_worker(void) /** * create_worker - create a new workqueue worker - * @gcwq: gcwq the new worker will belong to + * @pool: pool the new worker will belong to * @bind: whether to set affinity to @cpu or not * - * Create a new worker which is bound to @gcwq. The returned worker + * Create a new worker which is bound to @pool. The returned worker * can be started by calling start_worker() or destroyed using * destroy_worker(). * @@ -1369,10 +1370,10 @@ static struct worker *alloc_worker(void) * RETURNS: * Pointer to the newly created worker. */ -static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +static struct worker *create_worker(struct worker_pool *pool, bool bind) { + struct global_cwq *gcwq = pool->gcwq; bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; - struct worker_pool *pool = &gcwq->pool; struct worker *worker = NULL; int id = -1; @@ -1480,27 +1481,27 @@ static void destroy_worker(struct worker *worker) ida_remove(&pool->worker_ida, id); } -static void idle_worker_timeout(unsigned long __gcwq) +static void idle_worker_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; spin_lock_irq(&gcwq->lock); - if (too_many_workers(gcwq)) { + if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(gcwq->pool.idle_list.prev, struct worker, - entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) - mod_timer(&gcwq->pool.idle_timer, expires); + mod_timer(&pool->idle_timer, expires); else { /* it's been idle for too long, wake up manager */ gcwq->flags |= GCWQ_MANAGE_WORKERS; - wake_up_worker(gcwq); + wake_up_worker(pool); } } @@ -1526,37 +1527,38 @@ static bool send_mayday(struct work_struct *work) return true; } -static void gcwq_mayday_timeout(unsigned long __gcwq) +static void gcwq_mayday_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work; spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) { + if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. */ - list_for_each_entry(work, &gcwq->pool.worklist, entry) + list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } spin_unlock_irq(&gcwq->lock); - mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INTERVAL); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary - * @gcwq: gcwq to create a new worker for + * @pool: pool to create a new worker for * - * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is - * sent to all rescuers with works scheduled on @gcwq to resolve + * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be false and @@ -1571,52 +1573,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * false if no action was taken and gcwq->lock stayed locked, true * otherwise. */ -static bool maybe_create_worker(struct global_cwq *gcwq) +static bool maybe_create_worker(struct worker_pool *pool) __releases(&gcwq->lock) __acquires(&gcwq->lock) { - if (!need_to_create_worker(gcwq)) + struct global_cwq *gcwq = pool->gcwq; + + if (!need_to_create_worker(pool)) return false; restart: spin_unlock_irq(&gcwq->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ - mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { struct worker *worker; - worker = create_worker(gcwq, true); + worker = create_worker(pool, true); if (worker) { - del_timer_sync(&gcwq->pool.mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); start_worker(worker); - BUG_ON(need_to_create_worker(gcwq)); + BUG_ON(need_to_create_worker(pool)); return true; } - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; __set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(CREATE_COOLDOWN); - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; } - del_timer_sync(&gcwq->pool.mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) + if (need_to_create_worker(pool)) goto restart; return true; } /** * maybe_destroy_worker - destroy workers which have been idle for a while - * @gcwq: gcwq to destroy workers for + * @pool: pool to destroy workers for * - * Destroy @gcwq workers which have been idle for longer than + * Destroy @pool workers which have been idle for longer than * IDLE_WORKER_TIMEOUT. * * LOCKING: @@ -1627,20 +1631,19 @@ restart: * false if no action was taken and gcwq->lock stayed locked, true * otherwise. */ -static bool maybe_destroy_workers(struct global_cwq *gcwq) +static bool maybe_destroy_workers(struct worker_pool *pool) { bool ret = false; - while (too_many_workers(gcwq)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; - worker = list_entry(gcwq->pool.idle_list.prev, struct worker, - entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { - mod_timer(&gcwq->pool.idle_timer, expires); + mod_timer(&pool->idle_timer, expires); break; } @@ -1673,7 +1676,8 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) */ static bool manage_workers(struct worker *worker) { - struct global_cwq *gcwq = worker->pool->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; bool ret = false; if (gcwq->flags & GCWQ_MANAGING_WORKERS) @@ -1686,8 +1690,8 @@ static bool manage_workers(struct worker *worker) * Destroy and then create so that may_start_working() is true * on return. */ - ret |= maybe_destroy_workers(gcwq); - ret |= maybe_create_worker(gcwq); + ret |= maybe_destroy_workers(pool); + ret |= maybe_create_worker(pool); gcwq->flags &= ~GCWQ_MANAGING_WORKERS; @@ -1746,7 +1750,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = gcwq_determine_ins_pos(cwq->pool->gcwq, cwq); + struct list_head *pos = pool_determine_ins_pos(cwq->pool, cwq); trace_workqueue_activate_work(work); move_linked_works(work, pos, NULL); @@ -1874,7 +1878,7 @@ __acquires(&gcwq->lock) if (!list_empty(&pool->worklist) && get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) - wake_up_worker(gcwq); + wake_up_worker(pool); else gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; } @@ -1890,8 +1894,8 @@ __acquires(&gcwq->lock) * Unbound gcwq isn't concurrency managed and work items should be * executed ASAP. Wake up another worker if necessary. */ - if ((worker->flags & WORKER_UNBOUND) && need_more_worker(gcwq)) - wake_up_worker(gcwq); + if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); @@ -1983,11 +1987,11 @@ woke_up: worker_leave_idle(worker); recheck: /* no more worker necessary? */ - if (!need_more_worker(gcwq)) + if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ - if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* @@ -2018,11 +2022,11 @@ recheck: move_linked_works(work, &worker->scheduled, NULL); process_scheduled_works(worker); } - } while (keep_working(gcwq)); + } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) goto recheck; /* @@ -2107,8 +2111,8 @@ repeat: * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ - if (keep_working(gcwq)) - wake_up_worker(gcwq); + if (keep_working(pool)) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -3383,7 +3387,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * keep_working() are always true as long as the worklist is * not empty. */ - atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + atomic_set(get_pool_nr_running(&gcwq->pool), 0); spin_unlock_irq(&gcwq->lock); del_timer_sync(&gcwq->pool.idle_timer); @@ -3424,9 +3428,9 @@ static int __cpuinit trustee_thread(void *__gcwq) wake_up_process(worker->task); } - if (need_to_create_worker(gcwq)) { + if (need_to_create_worker(&gcwq->pool)) { spin_unlock_irq(&gcwq->lock); - worker = create_worker(gcwq, false); + worker = create_worker(&gcwq->pool, false); spin_lock_irq(&gcwq->lock); if (worker) { worker->flags |= WORKER_ROGUE; @@ -3540,7 +3544,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, /* fall through */ case CPU_UP_PREPARE: BUG_ON(gcwq->pool.first_idle); - new_worker = create_worker(gcwq, false); + new_worker = create_worker(&gcwq->pool, false); if (!new_worker) { if (new_trustee) kthread_stop(new_trustee); @@ -3788,7 +3792,7 @@ void thaw_workqueues(void) cwq_activate_first_delayed(cwq); } - wake_up_worker(gcwq); + wake_up_worker(&gcwq->pool); spin_unlock_irq(&gcwq->lock); } @@ -3822,10 +3826,10 @@ static int __init init_workqueues(void) init_timer_deferrable(&gcwq->pool.idle_timer); gcwq->pool.idle_timer.function = idle_worker_timeout; - gcwq->pool.idle_timer.data = (unsigned long)gcwq; + gcwq->pool.idle_timer.data = (unsigned long)&gcwq->pool; setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout, - (unsigned long)gcwq); + (unsigned long)&gcwq->pool); ida_init(&gcwq->pool.worker_ida); @@ -3840,7 +3844,7 @@ static int __init init_workqueues(void) if (cpu != WORK_CPU_UNBOUND) gcwq->flags &= ~GCWQ_DISASSOCIATED; - worker = create_worker(gcwq, true); + worker = create_worker(&gcwq->pool, true); BUG_ON(!worker); spin_lock_irq(&gcwq->lock); start_worker(worker); -- cgit v1.2.3 From 11ebea50dbc1ade5994b2c838a096078d4c02399 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Jul 2012 14:46:37 -0700 Subject: workqueue: separate out worker_pool flags GCWQ_MANAGE_WORKERS, GCWQ_MANAGING_WORKERS and GCWQ_HIGHPRI_PENDING are per-pool properties. Add worker_pool->flags and make the above three flags per-pool flags. The changes in this patch are mechanical and don't caues any functional difference. This is to prepare for multiple pools per gcwq. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2d82f7b193a0..7a98bae635fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -46,11 +46,13 @@ enum { /* global_cwq flags */ - GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ - GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - GCWQ_FREEZING = 1 << 3, /* freeze in progress */ - GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 1, /* freeze in progress */ + + /* pool flags */ + POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ + POOL_HIGHPRI_PENDING = 1 << 2, /* highpri works on queue */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -142,6 +144,7 @@ struct worker { struct worker_pool { struct global_cwq *gcwq; /* I: the owning gcwq */ + unsigned int flags; /* X: flags */ struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -583,7 +586,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) static bool __need_more_worker(struct worker_pool *pool) { return !atomic_read(get_pool_nr_running(pool)) || - pool->gcwq->flags & GCWQ_HIGHPRI_PENDING; + (pool->flags & POOL_HIGHPRI_PENDING); } /* @@ -612,7 +615,7 @@ static bool keep_working(struct worker_pool *pool) return !list_empty(&pool->worklist) && (atomic_read(nr_running) <= 1 || - pool->gcwq->flags & GCWQ_HIGHPRI_PENDING); + (pool->flags & POOL_HIGHPRI_PENDING)); } /* Do we need a new worker? Called from manager. */ @@ -625,13 +628,13 @@ static bool need_to_create_worker(struct worker_pool *pool) static bool need_to_manage_workers(struct worker_pool *pool) { return need_to_create_worker(pool) || - pool->gcwq->flags & GCWQ_MANAGE_WORKERS; + (pool->flags & POOL_MANAGE_WORKERS); } /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = pool->gcwq->flags & GCWQ_MANAGING_WORKERS; + bool managing = pool->flags & POOL_MANAGING_WORKERS; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -889,7 +892,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, * position for the work. If @cwq is for HIGHPRI wq, the work is * queued at the head of the queue but in FIFO order with respect to * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets GCWQ_HIGHPRI_PENDING flag to hint @pool that + * function also sets POOL_HIGHPRI_PENDING flag to hint @pool that * there are HIGHPRI works pending. * * CONTEXT: @@ -913,7 +916,7 @@ static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool, break; } - pool->gcwq->flags |= GCWQ_HIGHPRI_PENDING; + pool->flags |= POOL_HIGHPRI_PENDING; return &twork->entry; } @@ -1500,7 +1503,7 @@ static void idle_worker_timeout(unsigned long __pool) mod_timer(&pool->idle_timer, expires); else { /* it's been idle for too long, wake up manager */ - gcwq->flags |= GCWQ_MANAGE_WORKERS; + pool->flags |= POOL_MANAGE_WORKERS; wake_up_worker(pool); } } @@ -1680,11 +1683,11 @@ static bool manage_workers(struct worker *worker) struct global_cwq *gcwq = pool->gcwq; bool ret = false; - if (gcwq->flags & GCWQ_MANAGING_WORKERS) + if (pool->flags & POOL_MANAGING_WORKERS) return ret; - gcwq->flags &= ~GCWQ_MANAGE_WORKERS; - gcwq->flags |= GCWQ_MANAGING_WORKERS; + pool->flags &= ~POOL_MANAGE_WORKERS; + pool->flags |= POOL_MANAGING_WORKERS; /* * Destroy and then create so that may_start_working() is true @@ -1693,7 +1696,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + pool->flags &= ~POOL_MANAGING_WORKERS; /* * The trustee might be waiting to take over the manager @@ -1872,7 +1875,7 @@ __acquires(&gcwq->lock) * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, * wake up another worker; otherwise, clear HIGHPRI_PENDING. */ - if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { + if (unlikely(pool->flags & POOL_HIGHPRI_PENDING)) { struct work_struct *nwork = list_first_entry(&pool->worklist, struct work_struct, entry); @@ -1880,7 +1883,7 @@ __acquires(&gcwq->lock) get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) wake_up_worker(pool); else - gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; + pool->flags &= ~POOL_HIGHPRI_PENDING; } /* @@ -3360,10 +3363,10 @@ static int __cpuinit trustee_thread(void *__gcwq) * cancelled. */ BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + rc = trustee_wait_event(!(gcwq->pool.flags & POOL_MANAGING_WORKERS)); BUG_ON(rc < 0); - gcwq->flags |= GCWQ_MANAGING_WORKERS; + gcwq->pool.flags |= POOL_MANAGING_WORKERS; list_for_each_entry(worker, &gcwq->pool.idle_list, entry) worker->flags |= WORKER_ROGUE; @@ -3487,7 +3490,7 @@ static int __cpuinit trustee_thread(void *__gcwq) } /* relinquish manager role */ - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + gcwq->pool.flags &= ~POOL_MANAGING_WORKERS; /* notify completion */ gcwq->trustee = NULL; @@ -3604,7 +3607,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, spin_unlock_irq(&gcwq->lock); kthread_bind(gcwq->pool.first_idle->task, cpu); spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; + gcwq->pool.flags |= POOL_MANAGE_WORKERS; start_worker(gcwq->pool.first_idle); gcwq->pool.first_idle = NULL; break; -- cgit v1.2.3 From 4ce62e9e30cacc26885cab133ad1de358dd79f21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jul 2012 22:16:44 -0700 Subject: workqueue: introduce NR_WORKER_POOLS and for_each_worker_pool() Introduce NR_WORKER_POOLS and for_each_worker_pool() and convert code paths which need to manipulate all pools in a gcwq to use them. NR_WORKER_POOLS is currently one and for_each_worker_pool() iterates over only @gcwq->pool. Note that nr_running is per-pool property and converted to an array with NR_WORKER_POOLS elements and renamed to pool_nr_running. Note that get_pool_nr_running() currently assumes 0 index. The next patch will make use of non-zero index. The changes in this patch are mechanical and don't caues any functional difference. This is to prepare for multiple pools per gcwq. v2: nr_running indexing bug in get_pool_nr_running() fixed. v3: Pointer to array is stupid. Don't use it in get_pool_nr_running() as suggested by Linus. Signed-off-by: Tejun Heo Cc: Tony Luck Cc: Fengguang Wu Cc: Linus Torvalds --- kernel/workqueue.c | 223 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 153 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7a98bae635fa..b0daaea44eaa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -74,6 +74,8 @@ enum { TRUSTEE_RELEASE = 3, /* release workers */ TRUSTEE_DONE = 4, /* trustee is done */ + NR_WORKER_POOLS = 1, /* # worker pools per gcwq */ + BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, @@ -274,6 +276,9 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include +#define for_each_worker_pool(pool, gcwq) \ + for ((pool) = &(gcwq)->pool; (pool); (pool) = NULL) + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) @@ -454,7 +459,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ * try_to_wake_up(). Put it in a separate cacheline. */ static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); /* * Global cpu workqueue and nr_running counter for unbound gcwq. The @@ -462,7 +467,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); * workers have WORKER_UNBOUND set. */ static struct global_cwq unbound_global_cwq; -static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ +static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { + [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ +}; static int worker_thread(void *__worker); @@ -477,11 +484,12 @@ static struct global_cwq *get_gcwq(unsigned int cpu) static atomic_t *get_pool_nr_running(struct worker_pool *pool) { int cpu = pool->gcwq->cpu; + int idx = 0; if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(gcwq_nr_running, cpu); + return &per_cpu(pool_nr_running, cpu)[idx]; else - return &unbound_gcwq_nr_running; + return &unbound_pool_nr_running[idx]; } static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, @@ -3345,9 +3353,30 @@ EXPORT_SYMBOL_GPL(work_busy); __ret1 < 0 ? -1 : 0; \ }) +static bool gcwq_is_managing_workers(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + + for_each_worker_pool(pool, gcwq) + if (pool->flags & POOL_MANAGING_WORKERS) + return true; + return false; +} + +static bool gcwq_has_idle_workers(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + + for_each_worker_pool(pool, gcwq) + if (!list_empty(&pool->idle_list)) + return true; + return false; +} + static int __cpuinit trustee_thread(void *__gcwq) { struct global_cwq *gcwq = __gcwq; + struct worker_pool *pool; struct worker *worker; struct work_struct *work; struct hlist_node *pos; @@ -3363,13 +3392,15 @@ static int __cpuinit trustee_thread(void *__gcwq) * cancelled. */ BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->pool.flags & POOL_MANAGING_WORKERS)); + rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq)); BUG_ON(rc < 0); - gcwq->pool.flags |= POOL_MANAGING_WORKERS; + for_each_worker_pool(pool, gcwq) { + pool->flags |= POOL_MANAGING_WORKERS; - list_for_each_entry(worker, &gcwq->pool.idle_list, entry) - worker->flags |= WORKER_ROGUE; + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags |= WORKER_ROGUE; + } for_each_busy_worker(worker, i, pos, gcwq) worker->flags |= WORKER_ROGUE; @@ -3390,10 +3421,12 @@ static int __cpuinit trustee_thread(void *__gcwq) * keep_working() are always true as long as the worklist is * not empty. */ - atomic_set(get_pool_nr_running(&gcwq->pool), 0); + for_each_worker_pool(pool, gcwq) + atomic_set(get_pool_nr_running(pool), 0); spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->pool.idle_timer); + for_each_worker_pool(pool, gcwq) + del_timer_sync(&pool->idle_timer); spin_lock_irq(&gcwq->lock); /* @@ -3415,29 +3448,38 @@ static int __cpuinit trustee_thread(void *__gcwq) * may be frozen works in freezable cwqs. Don't declare * completion while frozen. */ - while (gcwq->pool.nr_workers != gcwq->pool.nr_idle || - gcwq->flags & GCWQ_FREEZING || - gcwq->trustee_state == TRUSTEE_IN_CHARGE) { - int nr_works = 0; + while (true) { + bool busy = false; - list_for_each_entry(work, &gcwq->pool.worklist, entry) { - send_mayday(work); - nr_works++; - } + for_each_worker_pool(pool, gcwq) + busy |= pool->nr_workers != pool->nr_idle; - list_for_each_entry(worker, &gcwq->pool.idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } + if (!busy && !(gcwq->flags & GCWQ_FREEZING) && + gcwq->trustee_state != TRUSTEE_IN_CHARGE) + break; - if (need_to_create_worker(&gcwq->pool)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(&gcwq->pool, false); - spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_ROGUE; - start_worker(worker); + for_each_worker_pool(pool, gcwq) { + int nr_works = 0; + + list_for_each_entry(work, &pool->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &pool->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(pool)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(pool, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; + start_worker(worker); + } } } @@ -3452,11 +3494,18 @@ static int __cpuinit trustee_thread(void *__gcwq) * all workers till we're canceled. */ do { - rc = trustee_wait_event(!list_empty(&gcwq->pool.idle_list)); - while (!list_empty(&gcwq->pool.idle_list)) - destroy_worker(list_first_entry(&gcwq->pool.idle_list, - struct worker, entry)); - } while (gcwq->pool.nr_workers && rc >= 0); + rc = trustee_wait_event(gcwq_has_idle_workers(gcwq)); + + i = 0; + for_each_worker_pool(pool, gcwq) { + while (!list_empty(&pool->idle_list)) { + worker = list_first_entry(&pool->idle_list, + struct worker, entry); + destroy_worker(worker); + } + i |= pool->nr_workers; + } + } while (i && rc >= 0); /* * At this point, either draining has completed and no worker @@ -3465,7 +3514,8 @@ static int __cpuinit trustee_thread(void *__gcwq) * Tell the remaining busy ones to rebind once it finishes the * currently scheduled works by scheduling the rebind_work. */ - WARN_ON(!list_empty(&gcwq->pool.idle_list)); + for_each_worker_pool(pool, gcwq) + WARN_ON(!list_empty(&pool->idle_list)); for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; @@ -3490,7 +3540,8 @@ static int __cpuinit trustee_thread(void *__gcwq) } /* relinquish manager role */ - gcwq->pool.flags &= ~POOL_MANAGING_WORKERS; + for_each_worker_pool(pool, gcwq) + pool->flags &= ~POOL_MANAGING_WORKERS; /* notify completion */ gcwq->trustee = NULL; @@ -3532,8 +3583,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); struct task_struct *new_trustee = NULL; - struct worker *uninitialized_var(new_worker); + struct worker *new_workers[NR_WORKER_POOLS] = { }; + struct worker_pool *pool; unsigned long flags; + int i; action &= ~CPU_TASKS_FROZEN; @@ -3546,12 +3599,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, kthread_bind(new_trustee, cpu); /* fall through */ case CPU_UP_PREPARE: - BUG_ON(gcwq->pool.first_idle); - new_worker = create_worker(&gcwq->pool, false); - if (!new_worker) { - if (new_trustee) - kthread_stop(new_trustee); - return NOTIFY_BAD; + i = 0; + for_each_worker_pool(pool, gcwq) { + BUG_ON(pool->first_idle); + new_workers[i] = create_worker(pool, false); + if (!new_workers[i++]) + goto err_destroy; } } @@ -3568,8 +3621,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); /* fall through */ case CPU_UP_PREPARE: - BUG_ON(gcwq->pool.first_idle); - gcwq->pool.first_idle = new_worker; + i = 0; + for_each_worker_pool(pool, gcwq) { + BUG_ON(pool->first_idle); + pool->first_idle = new_workers[i++]; + } break; case CPU_DYING: @@ -3586,8 +3642,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, gcwq->trustee_state = TRUSTEE_BUTCHER; /* fall through */ case CPU_UP_CANCELED: - destroy_worker(gcwq->pool.first_idle); - gcwq->pool.first_idle = NULL; + for_each_worker_pool(pool, gcwq) { + destroy_worker(pool->first_idle); + pool->first_idle = NULL; + } break; case CPU_DOWN_FAILED: @@ -3604,18 +3662,32 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, * Put the first_idle in and request a real manager to * take a look. */ - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->pool.first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->pool.flags |= POOL_MANAGE_WORKERS; - start_worker(gcwq->pool.first_idle); - gcwq->pool.first_idle = NULL; + for_each_worker_pool(pool, gcwq) { + spin_unlock_irq(&gcwq->lock); + kthread_bind(pool->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + pool->flags |= POOL_MANAGE_WORKERS; + start_worker(pool->first_idle); + pool->first_idle = NULL; + } break; } spin_unlock_irqrestore(&gcwq->lock, flags); return notifier_from_errno(0); + +err_destroy: + if (new_trustee) + kthread_stop(new_trustee); + + spin_lock_irqsave(&gcwq->lock, flags); + for (i = 0; i < NR_WORKER_POOLS; i++) + if (new_workers[i]) + destroy_worker(new_workers[i]); + spin_unlock_irqrestore(&gcwq->lock, flags); + + return NOTIFY_BAD; } #ifdef CONFIG_SMP @@ -3774,6 +3846,7 @@ void thaw_workqueues(void) for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; struct workqueue_struct *wq; spin_lock_irq(&gcwq->lock); @@ -3795,7 +3868,8 @@ void thaw_workqueues(void) cwq_activate_first_delayed(cwq); } - wake_up_worker(&gcwq->pool); + for_each_worker_pool(pool, gcwq) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -3816,25 +3890,29 @@ static int __init init_workqueues(void) /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; spin_lock_init(&gcwq->lock); - gcwq->pool.gcwq = gcwq; - INIT_LIST_HEAD(&gcwq->pool.worklist); gcwq->cpu = cpu; gcwq->flags |= GCWQ_DISASSOCIATED; - INIT_LIST_HEAD(&gcwq->pool.idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - init_timer_deferrable(&gcwq->pool.idle_timer); - gcwq->pool.idle_timer.function = idle_worker_timeout; - gcwq->pool.idle_timer.data = (unsigned long)&gcwq->pool; + for_each_worker_pool(pool, gcwq) { + pool->gcwq = gcwq; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); - setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout, - (unsigned long)&gcwq->pool); + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; - ida_init(&gcwq->pool.worker_ida); + setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, + (unsigned long)pool); + + ida_init(&pool->worker_ida); + } gcwq->trustee_state = TRUSTEE_DONE; init_waitqueue_head(&gcwq->trustee_wait); @@ -3843,15 +3921,20 @@ static int __init init_workqueues(void) /* create the initial worker */ for_each_online_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; + struct worker_pool *pool; if (cpu != WORK_CPU_UNBOUND) gcwq->flags &= ~GCWQ_DISASSOCIATED; - worker = create_worker(&gcwq->pool, true); - BUG_ON(!worker); - spin_lock_irq(&gcwq->lock); - start_worker(worker); - spin_unlock_irq(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) { + struct worker *worker; + + worker = create_worker(pool, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } } system_wq = alloc_workqueue("events", 0, 0); -- cgit v1.2.3 From 3270476a6c0ce322354df8679652f060d66526dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jul 2012 22:16:45 -0700 Subject: workqueue: reimplement WQ_HIGHPRI using a separate worker_pool WQ_HIGHPRI was implemented by queueing highpri work items at the head of the global worklist. Other than queueing at the head, they weren't handled differently; unfortunately, this could lead to execution latency of a few seconds on heavily loaded systems. Now that workqueue code has been updated to deal with multiple worker_pools per global_cwq, this patch reimplements WQ_HIGHPRI using a separate worker_pool. NR_WORKER_POOLS is bumped to two and gcwq->pools[0] is used for normal pri work items and ->pools[1] for highpri. Highpri workers get -20 nice level and has 'H' suffix in their names. Note that this change increases the number of kworkers per cpu. POOL_HIGHPRI_PENDING, pool_determine_ins_pos() and highpri chain wakeup code in process_one_work() are no longer used and removed. This allows proper prioritization of highpri work items and removes high execution latency of highpri work items. v2: nr_running indexing bug in get_pool_nr_running() fixed. v3: Refreshed for the get_pool_nr_running() update in the previous patch. Signed-off-by: Tejun Heo Reported-by: Josh Hunt LKML-Reference: Cc: Tony Luck Cc: Fengguang Wu --- kernel/workqueue.c | 100 +++++++++++++++-------------------------------------- 1 file changed, 27 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b0daaea44eaa..4fa9e3552f1e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -52,7 +52,6 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ - POOL_HIGHPRI_PENDING = 1 << 2, /* highpri works on queue */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -74,7 +73,7 @@ enum { TRUSTEE_RELEASE = 3, /* release workers */ TRUSTEE_DONE = 4, /* trustee is done */ - NR_WORKER_POOLS = 1, /* # worker pools per gcwq */ + NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, @@ -95,6 +94,7 @@ enum { * all cpus. Give -20. */ RESCUER_NICE_LEVEL = -20, + HIGHPRI_NICE_LEVEL = -20, }; /* @@ -174,7 +174,7 @@ struct global_cwq { struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct worker_pool pool; /* the worker pools */ + struct worker_pool pools[2]; /* normal and highpri pools */ struct task_struct *trustee; /* L: for gcwq shutdown */ unsigned int trustee_state; /* L: trustee state */ @@ -277,7 +277,8 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #include #define for_each_worker_pool(pool, gcwq) \ - for ((pool) = &(gcwq)->pool; (pool); (pool) = NULL) + for ((pool) = &(gcwq)->pools[0]; \ + (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ @@ -473,6 +474,11 @@ static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { static int worker_thread(void *__worker); +static int worker_pool_pri(struct worker_pool *pool) +{ + return pool - pool->gcwq->pools; +} + static struct global_cwq *get_gcwq(unsigned int cpu) { if (cpu != WORK_CPU_UNBOUND) @@ -484,7 +490,7 @@ static struct global_cwq *get_gcwq(unsigned int cpu) static atomic_t *get_pool_nr_running(struct worker_pool *pool) { int cpu = pool->gcwq->cpu; - int idx = 0; + int idx = worker_pool_pri(pool); if (cpu != WORK_CPU_UNBOUND) return &per_cpu(pool_nr_running, cpu)[idx]; @@ -586,15 +592,14 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) } /* - * Policy functions. These define the policies on how the global - * worker pool is managed. Unless noted otherwise, these functions - * assume that they're being called with gcwq->lock held. + * Policy functions. These define the policies on how the global worker + * pools are managed. Unless noted otherwise, these functions assume that + * they're being called with gcwq->lock held. */ static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_pool_nr_running(pool)) || - (pool->flags & POOL_HIGHPRI_PENDING); + return !atomic_read(get_pool_nr_running(pool)); } /* @@ -621,9 +626,7 @@ static bool keep_working(struct worker_pool *pool) { atomic_t *nr_running = get_pool_nr_running(pool); - return !list_empty(&pool->worklist) && - (atomic_read(nr_running) <= 1 || - (pool->flags & POOL_HIGHPRI_PENDING)); + return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; } /* Do we need a new worker? Called from manager. */ @@ -891,43 +894,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, work); } -/** - * pool_determine_ins_pos - find insertion position - * @pool: pool of interest - * @cwq: cwq a work is being queued for - * - * A work for @cwq is about to be queued on @pool, determine insertion - * position for the work. If @cwq is for HIGHPRI wq, the work is - * queued at the head of the queue but in FIFO order with respect to - * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets POOL_HIGHPRI_PENDING flag to hint @pool that - * there are HIGHPRI works pending. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to inserstion position. - */ -static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool, - struct cpu_workqueue_struct *cwq) -{ - struct work_struct *twork; - - if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &pool->worklist; - - list_for_each_entry(twork, &pool->worklist, entry) { - struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); - - if (!(tcwq->wq->flags & WQ_HIGHPRI)) - break; - } - - pool->flags |= POOL_HIGHPRI_PENDING; - return &twork->entry; -} - /** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to @@ -1068,7 +1034,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (likely(cwq->nr_active < cwq->max_active)) { trace_workqueue_activate_work(work); cwq->nr_active++; - worklist = pool_determine_ins_pos(cwq->pool, cwq); + worklist = &cwq->pool->worklist; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; @@ -1385,6 +1351,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) { struct global_cwq *gcwq = pool->gcwq; bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + const char *pri = worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1406,15 +1373,17 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) if (!on_unbound_cpu) worker->task = kthread_create_on_node(worker_thread, - worker, - cpu_to_node(gcwq->cpu), - "kworker/%u:%d", gcwq->cpu, id); + worker, cpu_to_node(gcwq->cpu), + "kworker/%u:%d%s", gcwq->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d", id); + "kworker/u:%d%s", id, pri); if (IS_ERR(worker->task)) goto fail; + if (worker_pool_pri(pool)) + set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + /* * A rogue worker will become a regular one if CPU comes * online later on. Make sure every worker has @@ -1761,10 +1730,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = pool_determine_ins_pos(cwq->pool, cwq); trace_workqueue_activate_work(work); - move_linked_works(work, pos, NULL); + move_linked_works(work, &cwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; } @@ -1879,21 +1847,6 @@ __acquires(&gcwq->lock) set_work_cpu(work, gcwq->cpu); list_del_init(&work->entry); - /* - * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, - * wake up another worker; otherwise, clear HIGHPRI_PENDING. - */ - if (unlikely(pool->flags & POOL_HIGHPRI_PENDING)) { - struct work_struct *nwork = list_first_entry(&pool->worklist, - struct work_struct, entry); - - if (!list_empty(&pool->worklist) && - get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) - wake_up_worker(pool); - else - pool->flags &= ~POOL_HIGHPRI_PENDING; - } - /* * CPU intensive works don't participate in concurrency * management. They're the scheduler's responsibility. @@ -3047,9 +3000,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = get_gcwq(cpu); + int pool_idx = (bool)(flags & WQ_HIGHPRI); BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->pool = &gcwq->pool; + cwq->pool = &gcwq->pools[pool_idx]; cwq->wq = wq; cwq->flush_color = -1; cwq->max_active = max_active; -- cgit v1.2.3 From 00cd8dd3bf95f2cc8435b4cac01d9995635c6d0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Jun 2012 17:13:09 -0400 Subject: stop passing nameidata to ->lookup() Just the flags; only NFS cares even about that, but there are legitimate uses for such argument. And getting rid of that completely would require splitting ->lookup() into a couple of methods (at least), so let's leave that alone for now... Signed-off-by: Al Viro --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b303dfc7dce0..0cd1314acdaf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -2570,7 +2570,7 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); -- cgit v1.2.3 From 79714f72d3b964611997de512cb29198c9f2dbbb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Jun 2012 03:01:42 +0400 Subject: get rid of kern_path_parent() all callers want the same thing, actually - a kinda-sorta analog of kern_path_create(). I.e. they want parent vfsmount/dentry (with ->i_mutex held, to make sure the child dentry is still their child) + the child dentry. Signed-off-by Al Viro --- kernel/audit_watch.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e683869365d9..3823281401b5 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent) /* Get path information necessary for adding watches. */ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata nd; - struct dentry *d; - int err; - - err = kern_path_parent(watch->path, &nd); - if (err) - return err; - - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); - return -EINVAL; - } - - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); - if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); + struct dentry *d = kern_path_locked(watch->path, parent); + if (IS_ERR(d)) return PTR_ERR(d); - } + mutex_unlock(&parent->dentry->d_inode->i_mutex); if (d->d_inode) { /* update watch filter fields */ watch->dev = d->d_inode->i_sb->s_dev; watch->ino = d->d_inode->i_ino; } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - - *parent = nd.path; dput(d); return 0; } -- cgit v1.2.3 From be34d1a3bc4b6f357a49acb55ae870c81337e4f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Jun 2012 12:55:18 +0100 Subject: VFS: Make clone_mnt()/copy_tree()/collect_mounts() return errors copy_tree() can theoretically fail in a case other than ENOMEM, but always returns NULL which is interpreted by callers as -ENOMEM. Change it to return an explicit error. Also change clone_mnt() for consistency and because union mounts will add new error cases. Thanks to Andreas Gruenbacher for a bug fix. [AV: folded braino fix by Dan Carpenter] Original-author: Valerie Aurora Signed-off-by: David Howells Cc: Valerie Aurora Cc: Andreas Gruenbacher Signed-off-by: Al Viro --- kernel/audit_tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 5bf0790497e7..3a5ca582ba1e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -595,7 +595,7 @@ void audit_trim_trees(void) root_mnt = collect_mounts(&path); path_put(&path); - if (!root_mnt) + if (IS_ERR(root_mnt)) goto skip_it; spin_lock(&hash_lock); @@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule) goto Err; mnt = collect_mounts(&path); path_put(&path); - if (!mnt) { - err = -ENOMEM; + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto Err; } @@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new) return err; tagged = collect_mounts(&path2); path_put(&path2); - if (!tagged) - return -ENOMEM; + if (IS_ERR(tagged)) + return PTR_ERR(tagged); err = kern_path(old, 0, &path1); if (err) { -- cgit v1.2.3 From 9249e17fe094d853d1ef7475dd559a2cc7e23d42 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Jun 2012 12:55:37 +0100 Subject: VFS: Pass mount flags to sget() Pass mount flags to sget() so that it can use them in initialising a new superblock before the set function is called. They could also be passed to the compare function. Signed-off-by: David Howells Signed-off-by: Al Viro --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0cd1314acdaf..af2b5641fc8b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1587,7 +1587,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, opts.new_root = new_root; /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_drop_root(opts.new_root); -- cgit v1.2.3 From 6b1859dba01c7d512b72d77e3fd7da8354235189 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:50 -0400 Subject: ntp: Fix STA_INS/DEL clearing bug In commit 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d, I introduced a bug that kept the STA_INS or STA_DEL bit from being cleared from time_status via adjtimex() without forcing STA_PLL first. Usually once the STA_INS is set, it isn't cleared until the leap second is applied, so its unlikely this affected anyone. However during testing I noticed it took some effort to cancel a leap second once STA_INS was set. Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava CC: stable@vger.kernel.org # 3.4 Link: http://lkml.kernel.org/r/1342156917-25092-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33abcc7bb..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) time_state = TIME_DEL; break; case TIME_INS: - if (secs % 86400 == 0) { + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; time_tai++; @@ -418,7 +420,9 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if ((secs + 1) % 86400 == 0) { + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { leap = 1; time_tai--; time_state = TIME_WAIT; -- cgit v1.2.3 From 42e71e81f5bb5125ca7c194b5ccf1c93511ff8fb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:51 -0400 Subject: time: Whitespace cleanups per Ingo%27s requests Ingo noted a number of places where there is inconsistent use of whitespace. This patch tries to address the main culprits. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 269b1fe5f2ae..c2f12aa87fce 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -24,32 +24,31 @@ /* Structure holding internal timekeeping values. */ struct timekeeper { /* Current clocksource used for timekeeping. */ - struct clocksource *clock; + struct clocksource *clock; /* NTP adjusted clock multiplier */ - u32 mult; + u32 mult; /* The shift value of the current clocksource. */ - int shift; - + int shift; /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; + cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; + u64 xtime_interval; /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; + s64 xtime_remainder; /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; + u32 raw_interval; /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ - u64 xtime_nsec; + u64 xtime_nsec; /* Difference between accumulated time and NTP time in ntp * shifted nano seconds. */ - s64 ntp_error; + s64 ntp_error; /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ - int ntp_error_shift; + int ntp_error_shift; /* The current time */ - struct timespec xtime; + struct timespec xtime; /* * wall_to_monotonic is what we need to add to xtime (or xtime corrected * for sub jiffie times) to get to monotonic time. Monotonic is pegged @@ -64,20 +63,17 @@ struct timekeeper { * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. */ - struct timespec wall_to_monotonic; + struct timespec wall_to_monotonic; /* time spent in suspend */ - struct timespec total_sleep_time; + struct timespec total_sleep_time; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; - + struct timespec raw_time; /* Offset clock monotonic -> clock realtime */ - ktime_t offs_real; - + ktime_t offs_real; /* Offset clock monotonic -> clock boottime */ - ktime_t offs_boot; - + ktime_t offs_boot; /* Seqlock for all timekeeper values */ - seqlock_t lock; + seqlock_t lock; }; static struct timekeeper timekeeper; @@ -547,6 +543,7 @@ u64 timekeeping_max_deferment(void) { unsigned long seq; u64 ret; + do { seq = read_seqbegin(&timekeeper.lock); -- cgit v1.2.3 From fee84c43e6afc42295ae8058cbbef9ea5633926c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:52 -0400 Subject: time: Explicitly use u32 instead of int for shift values Ingo noted that using a u32 instead of int for shift values would be better to make sure the compiler doesn't unnecessarily use complex signed arithmetic. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-4-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c2f12aa87fce..4fd83df0b14d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -28,7 +28,7 @@ struct timekeeper { /* NTP adjusted clock multiplier */ u32 mult; /* The shift value of the current clocksource. */ - int shift; + u32 shift; /* Number of clock cycles in one NTP interval. */ cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ @@ -45,7 +45,7 @@ struct timekeeper { s64 ntp_error; /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ - int ntp_error_shift; + u32 ntp_error_shift; /* The current time */ struct timespec xtime; @@ -960,7 +960,7 @@ static void timekeeping_adjust(s64 offset) * * Returns the unconsumed cycles. */ -static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) { u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; -- cgit v1.2.3 From 1e75fa8be9fb61e1af46b5b3b176347a4c958ca1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:53 -0400 Subject: time: Condense timekeeper.xtime into xtime_sec The timekeeper struct has a xtime_nsec, which keeps the sub-nanosecond remainder. This ends up being somewhat duplicative of the timekeeper.xtime.tv_nsec value, and we have to do extra work to keep them apart, copying the full nsec portion out and back in over and over. This patch simplifies some of the logic by taking the timekeeper xtime value and splitting it into timekeeper.xtime_sec and reuses the timekeeper.xtime_nsec for the sub-second portion (stored in higher res shifted nanoseconds). This simplifies some of the accumulation logic. And will allow for more accurate timekeeping once the vsyscall code is updated to use the shifted nanosecond remainder. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-5-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 181 ++++++++++++++++++++++++++++------------------ 1 file changed, 110 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4fd83df0b14d..b98d9bd73e5e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -38,8 +38,11 @@ struct timekeeper { /* Raw nano seconds accumulated per NTP interval. */ u32 raw_interval; - /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ + /* Current CLOCK_REALTIME time in seconds */ + u64 xtime_sec; + /* Clock shifted nano seconds */ u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp * shifted nano seconds. */ s64 ntp_error; @@ -47,8 +50,6 @@ struct timekeeper { * ntp shifted nano seconds. */ u32 ntp_error_shift; - /* The current time */ - struct timespec xtime; /* * wall_to_monotonic is what we need to add to xtime (or xtime corrected * for sub jiffie times) to get to monotonic time. Monotonic is pegged @@ -84,11 +85,37 @@ static struct timekeeper timekeeper; */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ + while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { + tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + tk->xtime_sec++; + } +} + +static struct timespec tk_xtime(struct timekeeper *tk) +{ + struct timespec ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); + return ts; +} +static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec = ts->tv_sec; + tk->xtime_nsec = ts->tv_nsec << tk->shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec += ts->tv_sec; + tk->xtime_nsec += ts->tv_nsec << tk->shift; +} /** * timekeeper_setup_internals - Set up internals to use clocksource clock. @@ -104,7 +131,9 @@ static void timekeeper_setup_internals(struct clocksource *clock) { cycle_t interval; u64 tmp, ntpinterval; + struct clocksource *old_clock; + old_clock = timekeeper.clock; timekeeper.clock = clock; clock->cycle_last = clock->read(clock); @@ -126,7 +155,14 @@ static void timekeeper_setup_internals(struct clocksource *clock) timekeeper.raw_interval = ((u64) interval * clock->mult) >> clock->shift; - timekeeper.xtime_nsec = 0; + /* if changing clocks, convert xtime_nsec shift units */ + if (old_clock) { + int shift_change = clock->shift - old_clock->shift; + if (shift_change < 0) + timekeeper.xtime_nsec >>= -shift_change; + else + timekeeper.xtime_nsec <<= shift_change; + } timekeeper.shift = clock->shift; timekeeper.ntp_error = 0; @@ -145,6 +181,7 @@ static inline s64 timekeeping_get_ns(void) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ clock = timekeeper.clock; @@ -153,9 +190,8 @@ static inline s64 timekeeping_get_ns(void) /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds using ntp adjusted mult. */ - return clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + nsec = cycle_delta * timekeeper.mult + timekeeper.xtime_nsec; + return nsec >> timekeeper.shift; } static inline s64 timekeeping_get_ns_raw(void) @@ -185,12 +221,15 @@ static void update_rt_offset(void) /* must hold write on timekeeper.lock */ static void timekeeping_update(bool clearntp) { + struct timespec xt; + if (clearntp) { timekeeper.ntp_error = 0; ntp_clear(); } update_rt_offset(); - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + xt = tk_xtime(&timekeeper); + update_vsyscall(&xt, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -213,13 +252,12 @@ static void timekeeping_forward_now(void) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + timekeeper.xtime_nsec += cycle_delta * timekeeper.mult; /* If arch requires, add in gettimeoffset() */ - nsec += arch_gettimeoffset(); + timekeeper.xtime_nsec += arch_gettimeoffset() << timekeeper.shift; - timespec_add_ns(&timekeeper.xtime, nsec); + tk_normalize_xtime(&timekeeper); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); timespec_add_ns(&timekeeper.raw_time, nsec); @@ -234,15 +272,15 @@ static void timekeeping_forward_now(void) void getnstimeofday(struct timespec *ts) { unsigned long seq; - s64 nsecs; + s64 nsecs = 0; WARN_ON(timekeeping_suspended); do { seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; - nsecs = timekeeping_get_ns(); + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -262,11 +300,10 @@ ktime_t ktime_get(void) do { seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime.tv_sec + + secs = timekeeper.xtime_sec + timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeper.xtime.tv_nsec + + nsecs = timekeeping_get_ns() + timekeeper.wall_to_monotonic.tv_nsec; - nsecs += timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -291,22 +328,21 @@ void ktime_get_ts(struct timespec *ts) { struct timespec tomono; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(); tomono = timekeeper.wall_to_monotonic; - nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); + ts->tv_nsec += arch_gettimeoffset(); } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec + nsecs); + ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -334,7 +370,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) seq = read_seqbegin(&timekeeper.lock); *ts_raw = timekeeper.raw_time; - *ts_real = timekeeper.xtime; + ts_real->tv_sec = timekeeper.xtime_sec; + ts_real->tv_nsec = 0; nsecs_raw = timekeeping_get_ns_raw(); nsecs_real = timekeeping_get_ns(); @@ -377,7 +414,7 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { - struct timespec ts_delta; + struct timespec ts_delta, xt; unsigned long flags; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) @@ -387,12 +424,15 @@ int do_settimeofday(const struct timespec *tv) timekeeping_forward_now(); - ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; + xt = tk_xtime(&timekeeper); + ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; + timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, ts_delta); - timekeeper.xtime = *tv; + tk_set_xtime(&timekeeper, tv); + timekeeping_update(true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -422,7 +462,8 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(); - timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); + + tk_xtime_add(&timekeeper, ts); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *ts); @@ -606,14 +647,12 @@ void __init timekeeping_init(void) clock->enable(clock); timekeeper_setup_internals(clock); - timekeeper.xtime.tv_sec = now.tv_sec; - timekeeper.xtime.tv_nsec = now.tv_nsec; + tk_set_xtime(&timekeeper, &now); timekeeper.raw_time.tv_sec = 0; timekeeper.raw_time.tv_nsec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) { - boot.tv_sec = timekeeper.xtime.tv_sec; - boot.tv_nsec = timekeeper.xtime.tv_nsec; - } + if (boot.tv_sec == 0 && boot.tv_nsec == 0) + boot = tk_xtime(&timekeeper); + set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); update_rt_offset(); @@ -646,7 +685,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) return; } - timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); + tk_xtime_add(&timekeeper, delta); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *delta); update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); @@ -742,7 +781,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); + delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -977,9 +1016,9 @@ static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) while (timekeeper.xtime_nsec >= nsecps) { int leap; timekeeper.xtime_nsec -= nsecps; - timekeeper.xtime.tv_sec++; - leap = second_overflow(timekeeper.xtime.tv_sec); - timekeeper.xtime.tv_sec += leap; + timekeeper.xtime_sec++; + leap = second_overflow(timekeeper.xtime_sec); + timekeeper.xtime_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; if (leap) clock_was_set_delayed(); @@ -1015,6 +1054,7 @@ static void update_wall_time(void) cycle_t offset; int shift = 0, maxshift; unsigned long flags; + s64 remainder; write_seqlock_irqsave(&timekeeper.lock, flags); @@ -1029,8 +1069,6 @@ static void update_wall_time(void) #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif - timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << - timekeeper.shift; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -1076,28 +1114,31 @@ static void update_wall_time(void) timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } - /* - * Store full nanoseconds into xtime after rounding it up and - * add the remainder to the error difference. - */ - timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> - timekeeper.shift) + 1; - timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << - timekeeper.shift; - timekeeper.ntp_error += timekeeper.xtime_nsec << - timekeeper.ntp_error_shift; + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), this can be killed. + */ + remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); + timekeeper.xtime_nsec -= remainder; + timekeeper.xtime_nsec += 1 << timekeeper.shift; + timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; /* * Finally, make sure that after the rounding - * xtime.tv_nsec isn't larger than NSEC_PER_SEC + * xtime_nsec isn't larger than NSEC_PER_SEC */ - if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { + if (unlikely(timekeeper.xtime_nsec >= + ((u64)NSEC_PER_SEC << timekeeper.shift))) { int leap; - timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; - timekeeper.xtime.tv_sec++; - leap = second_overflow(timekeeper.xtime.tv_sec); - timekeeper.xtime.tv_sec += leap; + timekeeper.xtime_nsec -= (u64)NSEC_PER_SEC << timekeeper.shift; + timekeeper.xtime_sec++; + leap = second_overflow(timekeeper.xtime_sec); + timekeeper.xtime_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; if (leap) clock_was_set_delayed(); @@ -1148,21 +1189,20 @@ void get_monotonic_boottime(struct timespec *ts) { struct timespec tomono, sleep; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(); tomono = timekeeper.wall_to_monotonic; sleep = timekeeper.total_sleep_time; - nsecs = timekeeping_get_ns(); } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); @@ -1195,13 +1235,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return timekeeper.xtime.tv_sec; + return timekeeper.xtime_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return timekeeper.xtime; + return tk_xtime(&timekeeper); } struct timespec current_kernel_time(void) @@ -1212,7 +1252,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&timekeeper.lock); - now = timekeeper.xtime; + now = tk_xtime(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); return now; @@ -1227,7 +1267,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&timekeeper.lock); - now = timekeeper.xtime; + now = tk_xtime(&timekeeper); mono = timekeeper.wall_to_monotonic; } while (read_seqretry(&timekeeper.lock, seq)); @@ -1262,7 +1302,7 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, do { seq = read_seqbegin(&timekeeper.lock); - *xtim = timekeeper.xtime; + *xtim = tk_xtime(&timekeeper); *wtom = timekeeper.wall_to_monotonic; *sleep = timekeeper.total_sleep_time; } while (read_seqretry(&timekeeper.lock, seq)); @@ -1286,9 +1326,8 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) do { seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime.tv_sec; - nsecs = timekeeper.xtime.tv_nsec; - nsecs += timekeeping_get_ns(); + secs = timekeeper.xtime_sec; + nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); -- cgit v1.2.3 From 1f4f948706bcec1b51bf6492bf04057d2e21e273 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:54 -0400 Subject: time: Refactor accumulation of nsecs to secs We do the exact same logic moving nsecs to secs in the timekeeper in multiple places, so condense this into a single function. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-6-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 54 ++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b98d9bd73e5e..cb4a433bab97 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -990,6 +990,35 @@ static void timekeeping_adjust(s64 offset) } +/** + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates a the nsecs greater then a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + * + */ +static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) +{ + u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + + while (tk->xtime_nsec >= nsecps) { + int leap; + + tk->xtime_nsec -= nsecps; + tk->xtime_sec++; + + /* Figure out if its a leap sec and apply if needed */ + leap = second_overflow(tk->xtime_sec); + tk->xtime_sec += leap; + tk->wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); + + } +} + + /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -1001,7 +1030,6 @@ static void timekeeping_adjust(s64 offset) */ static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; /* If the offset is smaller than a shifted interval, do nothing */ @@ -1013,16 +1041,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; - while (timekeeper.xtime_nsec >= nsecps) { - int leap; - timekeeper.xtime_nsec -= nsecps; - timekeeper.xtime_sec++; - leap = second_overflow(timekeeper.xtime_sec); - timekeeper.xtime_sec += leap; - timekeeper.wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); - } + + accumulate_nsecs_to_secs(&timekeeper); /* Accumulate raw time */ raw_nsecs = timekeeper.raw_interval << shift; @@ -1132,17 +1152,7 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - if (unlikely(timekeeper.xtime_nsec >= - ((u64)NSEC_PER_SEC << timekeeper.shift))) { - int leap; - timekeeper.xtime_nsec -= (u64)NSEC_PER_SEC << timekeeper.shift; - timekeeper.xtime_sec++; - leap = second_overflow(timekeeper.xtime_sec); - timekeeper.xtime_sec += leap; - timekeeper.wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); - } + accumulate_nsecs_to_secs(&timekeeper); timekeeping_update(false); -- cgit v1.2.3 From f2a5a0854efc62abe7f69e9947842cb135837f9a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:55 -0400 Subject: time: Move arch_gettimeoffset() usage into timekeeping_get_ns() Since we call arch_gettimeoffset() in all the accessor functions, move arch_gettimeoffset() calls into timekeeping_get_ns() and timekeeping_get_ns_raw() to simplify the code. This also makes the code easier to maintain as we don't have to worry about forgetting the arch_gettimeoffset() as has happened in the past. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-7-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cb4a433bab97..e43289df28c2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -191,13 +191,17 @@ static inline s64 timekeeping_get_ns(void) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; nsec = cycle_delta * timekeeper.mult + timekeeper.xtime_nsec; - return nsec >> timekeeper.shift; + nsec >>= timekeeper.shift; + + /* If arch requires, add in gettimeoffset() */ + return nsec + arch_gettimeoffset(); } static inline s64 timekeeping_get_ns_raw(void) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ clock = timekeeper.clock; @@ -206,8 +210,11 @@ static inline s64 timekeeping_get_ns_raw(void) /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds. */ - return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + /* convert delta to nanoseconds. */ + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + + /* If arch requires, add in gettimeoffset() */ + return nsec + arch_gettimeoffset(); } static void update_rt_offset(void) @@ -282,9 +289,6 @@ void getnstimeofday(struct timespec *ts) ts->tv_sec = timekeeper.xtime_sec; ts->tv_nsec = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); - } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts, nsecs); @@ -304,8 +308,6 @@ ktime_t ktime_get(void) timekeeper.wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns() + timekeeper.wall_to_monotonic.tv_nsec; - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); } while (read_seqretry(&timekeeper.lock, seq)); /* @@ -336,8 +338,6 @@ void ktime_get_ts(struct timespec *ts) ts->tv_sec = timekeeper.xtime_sec; ts->tv_nsec = timekeeping_get_ns(); tomono = timekeeper.wall_to_monotonic; - /* If arch requires, add in gettimeoffset() */ - ts->tv_nsec += arch_gettimeoffset(); } while (read_seqretry(&timekeeper.lock, seq)); @@ -365,8 +365,6 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - u32 arch_offset; - seq = read_seqbegin(&timekeeper.lock); *ts_raw = timekeeper.raw_time; @@ -376,11 +374,6 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(); nsecs_real = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - arch_offset = arch_gettimeoffset(); - nsecs_raw += arch_offset; - nsecs_real += arch_offset; - } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); @@ -1338,8 +1331,6 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) secs = timekeeper.xtime_sec; nsecs = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); *offs_real = timekeeper.offs_real; *offs_boot = timekeeper.offs_boot; -- cgit v1.2.3 From 2a8c0883c3cfffcc148ea606e2a4e7453cd75e73 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:56 -0400 Subject: time: Move xtime_nsec adjustment underflow handling timekeeping_adjust When we make adjustments speeding up the clock, its possible for xtime_nsec to underflow. We already handle this properly, but we do so from update_wall_time() instead of the more logical timekeeping_adjust(), where the possible underflow actually occurs. Thus, move the correction logic to the timekeeping_adjust, which is the function that causes the issue. Making update_wall_time() more readable. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-8-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e43289df28c2..aeeaab8cba6e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -980,6 +980,27 @@ static void timekeeping_adjust(s64 offset) timekeeper.xtime_nsec -= offset; timekeeper.ntp_error -= (interval - offset) << timekeeper.ntp_error_shift; + + /* + * It may be possible that when we entered this function, xtime_nsec + * was very small. Further, if we're slightly speeding the clocksource + * in the code above, its possible the required corrective factor to + * xtime_nsec could cause it to underflow. + * + * Now, since we already accumulated the second, cannot simply roll + * the accumulated second back, since the NTP subsystem has been + * notified via second_overflow. So instead we push xtime_nsec forward + * by the amount we underflowed, and add that amount into the error. + * + * We'll correct this error next time through this function, when + * xtime_nsec is not as small. + */ + if (unlikely((s64)timekeeper.xtime_nsec < 0)) { + s64 neg = -(s64)timekeeper.xtime_nsec; + timekeeper.xtime_nsec = 0; + timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; + } + } @@ -1105,27 +1126,6 @@ static void update_wall_time(void) /* correct the clock when NTP error is too big */ timekeeping_adjust(offset); - /* - * Since in the loop above, we accumulate any amount of time - * in xtime_nsec over a second into xtime.tv_sec, its possible for - * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in timekeeping_adjust(), - * its possible the required corrective factor to xtime_nsec could - * cause it to underflow. - * - * Now, we cannot simply roll the accumulated second back, since - * the NTP subsystem has been notified via second_overflow. So - * instead we push xtime_nsec forward by the amount we underflowed, - * and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. - */ - if (unlikely((s64)timekeeper.xtime_nsec < 0)) { - s64 neg = -(s64)timekeeper.xtime_nsec; - timekeeper.xtime_nsec = 0; - timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; - } /* * Store only full nanoseconds into xtime_nsec after rounding -- cgit v1.2.3 From f726a697d06102e7a1fc0a87308cb30a84580205 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:57 -0400 Subject: time: Rework timekeeping functions to take timekeeper ptr as argument As part of cleaning up the timekeeping code, this patch converts a number of internal functions to takei a timekeeper ptr as an argument, so that the internal functions don't access the global timekeeper structure directly. This allows for further optimizations to reduce lock hold time later. This patch has been updated to include more consistent usage of the timekeeper value, by making sure it is always passed as a argument to non top-level functions. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-9-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 208 +++++++++++++++++++++++----------------------- 1 file changed, 103 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index aeeaab8cba6e..5980e902978c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -127,14 +127,14 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) * * Unless you're the timekeeping code, you should not be using this! */ -static void timekeeper_setup_internals(struct clocksource *clock) +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { cycle_t interval; u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = timekeeper.clock; - timekeeper.clock = clock; + old_clock = tk->clock; + tk->clock = clock; clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ @@ -147,64 +147,64 @@ static void timekeeper_setup_internals(struct clocksource *clock) tmp = 1; interval = (cycle_t) tmp; - timekeeper.cycle_interval = interval; + tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - timekeeper.xtime_interval = (u64) interval * clock->mult; - timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; - timekeeper.raw_interval = + tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_remainder = ntpinterval - tk->xtime_interval; + tk->raw_interval = ((u64) interval * clock->mult) >> clock->shift; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - timekeeper.xtime_nsec >>= -shift_change; + tk->xtime_nsec >>= -shift_change; else - timekeeper.xtime_nsec <<= shift_change; + tk->xtime_nsec <<= shift_change; } - timekeeper.shift = clock->shift; + tk->shift = clock->shift; - timekeeper.ntp_error = 0; - timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_error = 0; + tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - timekeeper.mult = clock->mult; + tk->mult = clock->mult; } /* Timekeeper helper functions. */ -static inline s64 timekeeping_get_ns(void) +static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - nsec = cycle_delta * timekeeper.mult + timekeeper.xtime_nsec; - nsec >>= timekeeper.shift; + nsec = cycle_delta * tk->mult + tk->xtime_nsec; + nsec >>= tk->shift; /* If arch requires, add in gettimeoffset() */ return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns_raw(void) +static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ @@ -217,27 +217,26 @@ static inline s64 timekeeping_get_ns_raw(void) return nsec + arch_gettimeoffset(); } -static void update_rt_offset(void) +static void update_rt_offset(struct timekeeper *tk) { - struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + struct timespec tmp, *wtm = &tk->wall_to_monotonic; set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); - timekeeper.offs_real = timespec_to_ktime(tmp); + tk->offs_real = timespec_to_ktime(tmp); } /* must hold write on timekeeper.lock */ -static void timekeeping_update(bool clearntp) +static void timekeeping_update(struct timekeeper *tk, bool clearntp) { struct timespec xt; if (clearntp) { - timekeeper.ntp_error = 0; + tk->ntp_error = 0; ntp_clear(); } - update_rt_offset(); - xt = tk_xtime(&timekeeper); - update_vsyscall(&xt, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); + update_rt_offset(tk); + xt = tk_xtime(tk); + update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } @@ -248,26 +247,26 @@ static void timekeeping_update(bool clearntp) * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void timekeeping_forward_now(void) +static void timekeeping_forward_now(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - timekeeper.xtime_nsec += cycle_delta * timekeeper.mult; + tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - timekeeper.xtime_nsec += arch_gettimeoffset() << timekeeper.shift; + tk->xtime_nsec += arch_gettimeoffset() << tk->shift; - tk_normalize_xtime(&timekeeper); + tk_normalize_xtime(tk); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&timekeeper.raw_time, nsec); + timespec_add_ns(&tk->raw_time, nsec); } /** @@ -287,7 +286,7 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&timekeeper.lock); ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(); + ts->tv_nsec = timekeeping_get_ns(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); @@ -306,7 +305,7 @@ ktime_t ktime_get(void) seq = read_seqbegin(&timekeeper.lock); secs = timekeeper.xtime_sec + timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns() + + nsecs = timekeeping_get_ns(&timekeeper) + timekeeper.wall_to_monotonic.tv_nsec; } while (read_seqretry(&timekeeper.lock, seq)); @@ -336,7 +335,7 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqbegin(&timekeeper.lock); ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(); + ts->tv_nsec = timekeeping_get_ns(&timekeeper); tomono = timekeeper.wall_to_monotonic; } while (read_seqretry(&timekeeper.lock, seq)); @@ -371,8 +370,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) ts_real->tv_sec = timekeeper.xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(); - nsecs_real = timekeeping_get_ns(); + nsecs_raw = timekeeping_get_ns_raw(&timekeeper); + nsecs_real = timekeeping_get_ns(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); @@ -415,7 +414,7 @@ int do_settimeofday(const struct timespec *tv) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); xt = tk_xtime(&timekeeper); ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; @@ -426,7 +425,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(&timekeeper, tv); - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -453,14 +452,14 @@ int timekeeping_inject_offset(struct timespec *ts) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); tk_xtime_add(&timekeeper, ts); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *ts); - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -485,14 +484,14 @@ static int change_clocksource(void *data) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); if (!new->enable || new->enable(new) == 0) { old = timekeeper.clock; - timekeeper_setup_internals(new); + tk_setup_internals(&timekeeper, new); if (old->disable) old->disable(old); } - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -542,7 +541,7 @@ void getrawmonotonic(struct timespec *ts) do { seq = read_seqbegin(&timekeeper.lock); - nsecs = timekeeping_get_ns_raw(); + nsecs = timekeeping_get_ns_raw(&timekeeper); *ts = timekeeper.raw_time; } while (read_seqretry(&timekeeper.lock, seq)); @@ -638,7 +637,7 @@ void __init timekeeping_init(void) clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - timekeeper_setup_internals(clock); + tk_setup_internals(&timekeeper, clock); tk_set_xtime(&timekeeper, &now); timekeeper.raw_time.tv_sec = 0; @@ -648,7 +647,7 @@ void __init timekeeping_init(void) set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_rt_offset(); + update_rt_offset(&timekeeper); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -670,7 +669,8 @@ static void update_sleep_time(struct timespec t) * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ -static void __timekeeping_inject_sleeptime(struct timespec *delta) +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, + struct timespec *delta) { if (!timespec_valid(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " @@ -678,10 +678,9 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) return; } - tk_xtime_add(&timekeeper, delta); - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, *delta); - update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); + tk_xtime_add(tk, delta); + tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); + update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); } @@ -707,11 +706,11 @@ void timekeeping_inject_sleeptime(struct timespec *delta) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); - __timekeeping_inject_sleeptime(delta); + __timekeeping_inject_sleeptime(&timekeeper, delta); - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -740,7 +739,7 @@ static void timekeeping_resume(void) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(&ts); + __timekeeping_inject_sleeptime(&timekeeper, &ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); @@ -765,7 +764,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); timekeeping_suspended = 1; /* @@ -813,7 +812,8 @@ device_initcall(timekeeping_init_ops); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, + s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -829,7 +829,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -838,8 +838,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); - tick_error -= timekeeper.xtime_interval >> 1; + tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); + tick_error -= tk->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -864,9 +864,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void timekeeping_adjust(s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - s64 error, interval = timekeeper.cycle_interval; + s64 error, interval = tk->cycle_interval; int adj; /* @@ -882,7 +882,7 @@ static void timekeeping_adjust(s64 offset) * * Note: It does not "save" on aggravation when reading the code. */ - error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); + error = tk->ntp_error >> (tk->ntp_error_shift - 1); if (error > interval) { /* * We now divide error by 4(via shift), which checks if @@ -904,7 +904,8 @@ static void timekeeping_adjust(s64 offset) if (likely(error <= interval)) adj = 1; else - adj = timekeeping_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(tk, error, &interval, + &offset); } else if (error < -interval) { /* See comment above, this is just switched for the negative */ error >>= 2; @@ -913,18 +914,17 @@ static void timekeeping_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else /* No adjustment needed */ + adj = timekeeping_bigadjust(tk, error, &interval, + &offset); + } else return; - if (unlikely(timekeeper.clock->maxadj && - (timekeeper.mult + adj > - timekeeper.clock->mult + timekeeper.clock->maxadj))) { + if (unlikely(tk->clock->maxadj && + (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", - timekeeper.clock->name, (long)timekeeper.mult + adj, - (long)timekeeper.clock->mult + - timekeeper.clock->maxadj); + tk->clock->name, (long)tk->mult + adj, + (long)tk->clock->mult + tk->clock->maxadj); } /* * So the following can be confusing. @@ -975,11 +975,10 @@ static void timekeeping_adjust(s64 offset) * * XXX - TODO: Doc ntp_error calculation. */ - timekeeper.mult += adj; - timekeeper.xtime_interval += interval; - timekeeper.xtime_nsec -= offset; - timekeeper.ntp_error -= (interval - offset) << - timekeeper.ntp_error_shift; + tk->mult += adj; + tk->xtime_interval += interval; + tk->xtime_nsec -= offset; + tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; /* * It may be possible that when we entered this function, xtime_nsec @@ -995,10 +994,10 @@ static void timekeeping_adjust(s64 offset) * We'll correct this error next time through this function, when * xtime_nsec is not as small. */ - if (unlikely((s64)timekeeper.xtime_nsec < 0)) { - s64 neg = -(s64)timekeeper.xtime_nsec; - timekeeper.xtime_nsec = 0; - timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; + if (unlikely((s64)tk->xtime_nsec < 0)) { + s64 neg = -(s64)tk->xtime_nsec; + tk->xtime_nsec = 0; + tk->ntp_error += neg << tk->ntp_error_shift; } } @@ -1042,37 +1041,36 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) * * Returns the unconsumed cycles. */ -static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) +static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, + u32 shift) { u64 raw_nsecs; - /* If the offset is smaller than a shifted interval, do nothing */ - if (offset < timekeeper.cycle_interval<cycle_interval<cycle_last += timekeeper.cycle_interval << shift; + offset -= tk->cycle_interval << shift; + tk->clock->cycle_last += tk->cycle_interval << shift; - timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; - - accumulate_nsecs_to_secs(&timekeeper); + tk->xtime_nsec += tk->xtime_interval << shift; + accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = timekeeper.raw_interval << shift; - raw_nsecs += timekeeper.raw_time.tv_nsec; + raw_nsecs = tk->raw_interval << shift; + raw_nsecs += tk->raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - timekeeper.raw_time.tv_sec += raw_secs; + tk->raw_time.tv_sec += raw_secs; } - timekeeper.raw_time.tv_nsec = raw_nsecs; + tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - timekeeper.ntp_error += ntp_tick_length() << shift; - timekeeper.ntp_error -= - (timekeeper.xtime_interval + timekeeper.xtime_remainder) << - (timekeeper.ntp_error_shift + shift); + tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << + (tk->ntp_error_shift + shift); return offset; } @@ -1118,13 +1116,13 @@ static void update_wall_time(void) maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(offset, shift); + offset = logarithmic_accumulation(&timekeeper, offset, shift); if(offset < timekeeper.cycle_interval<tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(); + ts->tv_nsec = timekeeping_get_ns(&timekeeper); tomono = timekeeper.wall_to_monotonic; sleep = timekeeper.total_sleep_time; @@ -1330,7 +1328,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) seq = read_seqbegin(&timekeeper.lock); secs = timekeeper.xtime_sec; - nsecs = timekeeping_get_ns(); + nsecs = timekeeping_get_ns(&timekeeper); *offs_real = timekeeper.offs_real; *offs_boot = timekeeper.offs_boot; -- cgit v1.2.3 From 3e997130bd2e8c6f5aaa49d6e3161d4d29b43ab0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 12:50:42 -0400 Subject: timekeeping: Add missing update call in timekeeping_resume() The leap second rework unearthed another issue of inconsistent data. On timekeeping_resume() the timekeeper data is updated, but nothing calls timekeeping_update(), so now the update code in the timer interrupt sees stale values. This has been the case before those changes, but then the timer interrupt was using stale data as well so this went unnoticed for quite some time. Add the missing update call, so all the data is consistent everywhere. Reported-by: Andreas Schwab Reported-and-tested-by: "Rafael J. Wysocki" Reported-and-tested-by: Martin Steigerwald Cc: LKML Cc: Linux PM list Cc: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra , Cc: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 269b1fe5f2ae..3447cfaf11e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -717,6 +717,7 @@ static void timekeeping_resume(void) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; + timekeeping_update(false); write_sequnlock_irqrestore(&timekeeper.lock, flags); touch_softlockup_watchdog(); -- cgit v1.2.3 From 70498253186586e5dca7bc3ebd3415203b059fbc Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 16 Jul 2012 18:35:29 -0700 Subject: kmsg - properly print over-long continuation lines Reserve PREFIX_MAX bytes in the LOG_LINE_MAX line when buffering a continuation line, to be able to properly prefix the LOG_LINE_MAX line with the syslog prefix and timestamp when printing it. Reported-By: Dave Jones Signed-off-by: Kay Sievers Cc: stable Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 177fa49357a5..d87ca5c6a989 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -235,7 +235,8 @@ static u32 log_next_idx; static u64 clear_seq; static u32 clear_idx; -#define LOG_LINE_MAX 1024 +#define PREFIX_MAX 32 +#define LOG_LINE_MAX 1024 - PREFIX_MAX /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -876,7 +877,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev, if (buf) { if (print_prefix(msg, syslog, NULL) + - text_len + 1>= size - len) + text_len + 1 >= size - len) break; if (prefix) @@ -907,7 +908,7 @@ static int syslog_print(char __user *buf, int size) struct log *msg; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -930,7 +931,8 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); + n = msg_print_text(msg, syslog_prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); @@ -969,7 +971,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) char *text; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -1022,7 +1024,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; break; @@ -1367,15 +1370,15 @@ static struct cont { bool flushed:1; /* buffer sealed and committed */ } cont; -static void cont_flush(void) +static void cont_flush(enum log_flags flags) { if (cont.flushed) return; if (cont.len == 0) return; - log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); + log_store(cont.facility, cont.level, LOG_NOCONS | flags, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); cont.flushed = true; } @@ -1386,7 +1389,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len) return false; if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); + /* the line gets too long, split it up in separate records */ + cont_flush(LOG_CONT); return false; } @@ -1522,7 +1526,7 @@ asmlinkage int vprintk_emit(int facility, int level, * or another task also prints continuation lines. */ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(); + cont_flush(0); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) @@ -1540,7 +1544,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (cont.len && cont.owner == current) { if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); - cont_flush(); + cont_flush(0); } if (!stored) @@ -1633,7 +1637,8 @@ EXPORT_SYMBOL(printk); #else -#define LOG_LINE_MAX 0 +#define LOG_LINE_MAX 0 +#define PREFIX_MAX 0 static struct cont { size_t len; size_t cons; @@ -1938,7 +1943,7 @@ static enum log_flags console_prev; */ void console_unlock(void) { - static char text[LOG_LINE_MAX]; + static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; -- cgit v1.2.3 From 96efedf1491cdf0616e5e4fff0711cebf20f69c7 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 16 Jul 2012 18:35:29 -0700 Subject: kmsg - avoid warning for CONFIG_PRINTK=n compilations Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index d87ca5c6a989..6c3d5bf14da2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -216,6 +216,7 @@ struct log { */ static DEFINE_RAW_SPINLOCK(logbuf_lock); +#ifdef CONFIG_PRINTK /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -228,7 +229,6 @@ static u32 log_first_idx; /* index and sequence number of the next record to store in the buffer */ static u64 log_next_seq; -#ifdef CONFIG_PRINTK static u32 log_next_idx; /* the next printk record to read after the last 'clear' command */ @@ -1635,10 +1635,17 @@ asmlinkage int printk(const char *fmt, ...) } EXPORT_SYMBOL(printk); -#else +#else /* CONFIG_PRINTK */ #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 +#define LOG_LINE_MAX 0 +static u64 syslog_seq; +static u32 syslog_idx; +static enum log_flags syslog_prev; +static u64 log_first_seq; +static u32 log_first_idx; +static u64 log_next_seq; static struct cont { size_t len; size_t cons; -- cgit v1.2.3 From d39f3d77c9b1fe7cc33a14beb4a4849af0a4ac22 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 16 Jul 2012 18:35:30 -0700 Subject: kmsg - export "continuation record" flag to /dev/kmsg In some cases we are forced to store individual records for a continuation line print. Export a flag to allow the external re-construction of the line. The flag allows us to apply a similar logic externally which is used internally when the console, /proc/kmsg or the syslog() output is printed. $ cat /dev/kmsg 4,165,0,-;Free swap = 0kB 4,166,0,-;Total swap = 0kB 6,167,0,c;[ 4,168,0,+;0 4,169,0,+;1 4,170,0,+;2 4,171,0,+;3 4,172,0,+;] 6,173,0,-;[0 1 2 3 ] 6,174,0,-;Console: colour VGA+ 80x25 6,175,0,-;console [tty0] enabled Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6c3d5bf14da2..a41106e19077 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -361,6 +361,7 @@ static void log_store(int facility, int level, struct devkmsg_user { u64 seq; u32 idx; + enum log_flags prev; struct mutex lock; char buf[8192]; }; @@ -426,6 +427,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, struct log *msg; u64 ts_usec; size_t i; + char cont = '-'; size_t len; ssize_t ret; @@ -463,8 +465,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); - len = sprintf(user->buf, "%u,%llu,%llu;", - (msg->facility << 3) | msg->level, user->seq, ts_usec); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. + */ + if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + len = sprintf(user->buf, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, + user->seq, ts_usec, cont); + user->prev = msg->flags; /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { -- cgit v1.2.3 From eab072609e11a357181806ab5a5c309ef6eb76f5 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 16 Jul 2012 18:35:30 -0700 Subject: kmsg - do not flush partial lines when the console is busy Fragments of continuation lines are flushed to the console immediately. In case the console is locked, the fragment must be queued up in the cont buffer. If the the console is busy and the continuation line is complete, but no part of it was written to the console up to this point, we can just store the entire line as a regular record and free the buffer earlier. If the console is busy and earlier messages are already queued up, we should not flush the fragments of continuation lines, but store them after the queued up messages, to ensure the proper ordering. This keeps the console output better readable in case printk()s race against each other, or we receive over-long continuation lines we need to flush. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 93 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a41106e19077..4da2377131b0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -231,6 +231,11 @@ static u32 log_first_idx; static u64 log_next_seq; static u32 log_next_idx; +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; +static enum log_flags console_prev; + /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; static u32 clear_idx; @@ -1386,6 +1391,7 @@ static struct cont { u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log level of first message */ + enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1396,10 +1402,25 @@ static void cont_flush(enum log_flags flags) if (cont.len == 0) return; - log_store(cont.facility, cont.level, LOG_NOCONS | flags, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - - cont.flushed = true; + if (cont.cons) { + /* + * If a fragment of this line was directly flushed to the + * console; wait for the console to pick up the rest of the + * line. LOG_NOCONS suppresses a duplicated output. + */ + log_store(cont.facility, cont.level, flags | LOG_NOCONS, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); + cont.flags = flags; + cont.flushed = true; + } else { + /* + * If no fragment of this line ever reached the console, + * just submit it to the store and free the buffer. + */ + log_store(cont.facility, cont.level, flags, 0, + NULL, 0, cont.buf, cont.len); + cont.len = 0; + } } static bool cont_add(int facility, int level, const char *text, size_t len) @@ -1418,12 +1439,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.level = level; cont.owner = current; cont.ts_nsec = local_clock(); + cont.flags = 0; cont.cons = 0; cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); cont.len += len; + + if (cont.len > (sizeof(cont.buf) * 80) / 100) + cont_flush(LOG_CONT); + return true; } @@ -1432,7 +1458,7 @@ static size_t cont_print_text(char *text, size_t size) size_t textlen = 0; size_t len; - if (cont.cons == 0) { + if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { textlen += print_time(cont.ts_nsec, text); size -= textlen; } @@ -1447,7 +1473,8 @@ static size_t cont_print_text(char *text, size_t size) } if (cont.flushed) { - text[textlen++] = '\n'; + if (cont.flags & LOG_NEWLINE) + text[textlen++] = '\n'; /* got everything, release buffer */ cont.len = 0; } @@ -1545,7 +1572,7 @@ asmlinkage int vprintk_emit(int facility, int level, * or another task also prints continuation lines. */ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(0); + cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) @@ -1563,7 +1590,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (cont.len && cont.owner == current) { if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); - cont_flush(0); + cont_flush(LOG_NEWLINE); } if (!stored) @@ -1661,10 +1688,13 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 static u64 syslog_seq; static u32 syslog_idx; +static u64 console_seq; +static u32 console_idx; static enum log_flags syslog_prev; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; +static enum log_flags console_prev; static struct cont { size_t len; size_t cons; @@ -1948,10 +1978,34 @@ void wake_up_klogd(void) this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } -/* the next printk record to write to the console */ -static u64 console_seq; -static u32 console_idx; -static enum log_flags console_prev; +static void console_cont_flush(char *text, size_t size) +{ + unsigned long flags; + size_t len; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + + if (!cont.len) + goto out; + + /* + * We still queue earlier records, likely because the console was + * busy. The earlier ones need to be printed before this one, we + * did not flush any fragment so far, so just let it queue up. + */ + if (console_seq < log_next_seq && !cont.cons) + goto out; + + len = cont_print_text(text, size); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + return; +out: + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +} /** * console_unlock - unlock the console system @@ -1983,19 +2037,7 @@ void console_unlock(void) console_may_schedule = 0; /* flush buffered message fragment immediately to console */ - raw_spin_lock_irqsave(&logbuf_lock, flags); - if (cont.len && (cont.cons < cont.len || cont.flushed)) { - size_t len; - - len = cont_print_text(text, sizeof(text)); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, text, len); - start_critical_timings(); - local_irq_restore(flags); - } else - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - + console_cont_flush(text, sizeof(text)); again: for (;;) { struct log *msg; @@ -2032,6 +2074,7 @@ skip: * will properly dump everything later. */ msg->flags &= ~LOG_NOCONS; + console_prev = msg->flags; goto skip; } -- cgit v1.2.3 From b2ad368bebc0f772613668e893fa176396e9094c Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 9 Jul 2012 17:10:39 -0700 Subject: tracing: Fix initialization failure path in tracing_set_tracer() If tracer->init() fails, current code will leave current_tracer pointing to an unusable tracer, which at best makes 'current_tracer' report inaccurate value. Fix the issue by pointing current_tracer to nop tracer, and only update current_tracer with the new one after all the initialization succeeds. Signed-off-by: Anton Vorontsov Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49249c28690d..44ee11e31b82 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3172,10 +3172,10 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = t; + current_trace = &nop_trace; - topts = create_trace_option_files(current_trace); - if (current_trace->use_max_tr) { + topts = create_trace_option_files(t); + if (t->use_max_tr) { int cpu; /* we need to make per cpu buffer sizes equivalent */ for_each_tracing_cpu(cpu) { @@ -3195,6 +3195,7 @@ static int tracing_set_tracer(const char *buf) goto out; } + current_trace = t; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 21f679404a0c28bd5b1b3aff2a7218bbff4cb43d Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 9 Jul 2012 17:10:42 -0700 Subject: tracing/function: Introduce persistent trace option This patch introduces 'func_ptrace' option, now available in /sys/kernel/debug/tracing/options when function tracer is selected. The patch also adds some tiny code that calls back to pstore to record the trace. The callback is no-op when PSTORE=n. Signed-off-by: Anton Vorontsov Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_functions.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db09..13770abd7a12 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "trace.h" @@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_PSTORE = 0x2, +}; + +static struct tracer_flags func_flags; + static void function_trace_call(unsigned long ip, unsigned long parent_ip) { @@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { + /* + * So far tracing doesn't support multiple buffers, so + * we make an explicit call for now. + */ + if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) + pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -158,14 +173,12 @@ static struct ftrace_ops trace_stack_ops __read_mostly = .flags = FTRACE_OPS_FL_GLOBAL, }; -/* Our two options */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, +#endif +#ifdef CONFIG_PSTORE_FTRACE + { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, #endif { } /* Always set a last empty entry */ }; @@ -217,6 +230,8 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } + return 0; + } else if (bit == TRACE_FUNC_OPT_PSTORE) { return 0; } -- cgit v1.2.3 From f555f1231a69846d57099760f9c361982600ffa2 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 9 Jul 2012 17:10:46 -0700 Subject: tracing/function: Convert func_set_flag() to a switch statement Since the function accepts just one bit, we can use the switch construction instead of if/else if/... Just a cosmetic change, there should be no functional changes. Suggested-by: Steven Rostedt Signed-off-by: Anton Vorontsov Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_functions.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 13770abd7a12..a426f410c060 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -217,10 +217,11 @@ static void tracing_stop_function_trace(void) static int func_set_flag(u32 old_flags, u32 bit, int set) { - if (bit == TRACE_FUNC_OPT_STACK) { + switch (bit) { + case TRACE_FUNC_OPT_STACK: /* do nothing if already set */ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - return 0; + break; if (set) { unregister_ftrace_function(&trace_ops); @@ -230,12 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } - return 0; - } else if (bit == TRACE_FUNC_OPT_PSTORE) { - return 0; + break; + case TRACE_FUNC_OPT_PSTORE: + break; + default: + return -EINVAL; } - return -EINVAL; + return 0; } static struct tracer function_trace __read_mostly = -- cgit v1.2.3 From 6575820221f7a4dd6eadecf7bf83cdd154335eda Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:26 -0700 Subject: workqueue: perform cpu down operations from low priority cpu_notifier() Currently, all workqueue cpu hotplug operations run off CPU_PRI_WORKQUEUE which is higher than normal notifiers. This is to ensure that workqueue is up and running while bringing up a CPU before other notifiers try to use workqueue on the CPU. Per-cpu workqueues are supposed to remain working and bound to the CPU for normal CPU_DOWN_PREPARE notifiers. This holds mostly true even with workqueue offlining running with higher priority because workqueue CPU_DOWN_PREPARE only creates a bound trustee thread which runs the per-cpu workqueue without concurrency management without explicitly detaching the existing workers. However, if the trustee needs to create new workers, it creates unbound workers which may wander off to other CPUs while CPU_DOWN_PREPARE notifiers are in progress. Furthermore, if the CPU down is cancelled, the per-CPU workqueue may end up with workers which aren't bound to the CPU. While reliably reproducible with a convoluted artificial test-case involving scheduling and flushing CPU burning work items from CPU down notifiers, this isn't very likely to happen in the wild, and, even when it happens, the effects are likely to be hidden by the following successful CPU down. Fix it by using different priorities for up and down notifiers - high priority for up operations and low priority for down operations. Workqueue cpu hotplug operations will soon go through further cleanup. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4fa9e3552f1e..f59b7fd26e26 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3644,6 +3644,41 @@ err_destroy: return NOTIFY_BAD; } +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. + */ +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + case CPU_DYING: + case CPU_POST_DEAD: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + #ifdef CONFIG_SMP struct work_for_cpu { @@ -3839,7 +3874,8 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { -- cgit v1.2.3 From f2d5a0ee06c1813f985bb9386f3ccc0d0315720f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:26 -0700 Subject: workqueue: drop CPU_DYING notifier operation Workqueue used CPU_DYING notification to mark GCWQ_DISASSOCIATED. This was necessary because workqueue's CPU_DOWN_PREPARE happened before other DOWN_PREPARE notifiers and workqueue needed to stay associated across the rest of DOWN_PREPARE. After the previous patch, workqueue's DOWN_PREPARE happens after others and can set GCWQ_DISASSOCIATED directly. Drop CPU_DYING and let the trustee set GCWQ_DISASSOCIATED after disabling concurrency management. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f59b7fd26e26..1405fb98c0b1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1250,11 +1250,11 @@ static void worker_leave_idle(struct worker *worker) * verbatim as it's best effort and blocking and gcwq may be * [dis]associated in the meantime. * - * This function tries set_cpus_allowed() and locks gcwq and verifies - * the binding against GCWQ_DISASSOCIATED which is set during - * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters - * idle state or fetches works without dropping lock, it can guarantee - * the scheduling requirement described in the first paragraph. + * This function tries set_cpus_allowed() and locks gcwq and verifies the + * binding against %GCWQ_DISASSOCIATED which is set during + * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker + * enters idle state or fetches works without dropping lock, it can + * guarantee the scheduling requirement described in the first paragraph. * * CONTEXT: * Might sleep. Called without any lock but returns with gcwq->lock @@ -3349,6 +3349,12 @@ static int __cpuinit trustee_thread(void *__gcwq) rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq)); BUG_ON(rc < 0); + /* + * We've claimed all manager positions. Make all workers unbound + * and set DISASSOCIATED. Before this, all workers except for the + * ones which are still executing works from before the last CPU + * down must be on the cpu. After this, they may become diasporas. + */ for_each_worker_pool(pool, gcwq) { pool->flags |= POOL_MANAGING_WORKERS; @@ -3359,6 +3365,8 @@ static int __cpuinit trustee_thread(void *__gcwq) for_each_busy_worker(worker, i, pos, gcwq) worker->flags |= WORKER_ROGUE; + gcwq->flags |= GCWQ_DISASSOCIATED; + /* * Call schedule() so that we cross rq->lock and thus can * guarantee sched callbacks see the rogue flag. This is @@ -3582,16 +3590,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, } break; - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they'll all be diasporas. - */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; - case CPU_POST_DEAD: gcwq->trustee_state = TRUSTEE_BUTCHER; /* fall through */ @@ -3672,7 +3670,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - case CPU_DYING: case CPU_POST_DEAD: return workqueue_cpu_callback(nfb, action, hcpu); } -- cgit v1.2.3 From 403c821d452c03be4ced571ac91339a9d3631b17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: ROGUE workers are UNBOUND workers Currently, WORKER_UNBOUND is used to mark workers for the unbound global_cwq and WORKER_ROGUE is used to mark workers for disassociated per-cpu global_cwqs. Both are used to make the marked worker skip concurrency management and the only place they make any difference is in worker_enter_idle() where WORKER_ROGUE is used to skip scheduling idle timer, which can easily be replaced with trustee state testing. This patch replaces WORKER_ROGUE with WORKER_UNBOUND and drops WORKER_ROGUE. This is to prepare for removing trustee and handling disassociated global_cwqs as unbound. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1405fb98c0b1..af512927c607 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -58,13 +58,12 @@ enum { WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | + WORKER_CPU_INTENSIVE, /* gcwq->trustee_state */ TRUSTEE_START = 0, /* start */ @@ -1198,7 +1197,7 @@ static void worker_enter_idle(struct worker *worker) /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); - if (likely(!(worker->flags & WORKER_ROGUE))) { + if (likely(gcwq->trustee_state != TRUSTEE_DONE)) { if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); @@ -1207,7 +1206,7 @@ static void worker_enter_idle(struct worker *worker) /* * Sanity check nr_running. Because trustee releases gcwq->lock - * between setting %WORKER_ROGUE and zapping nr_running, the + * between setting %WORKER_UNBOUND and zapping nr_running, the * warning may trigger spuriously. Check iff trustee is idle. */ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && @@ -1301,10 +1300,10 @@ __acquires(&gcwq->lock) } /* - * Function for worker->rebind_work used to rebind rogue busy workers - * to the associated cpu which is coming back online. This is - * scheduled by cpu up but can race with other cpu hotplug operations - * and may be executed twice without intervening cpu down. + * Function for worker->rebind_work used to rebind unbound busy workers to + * the associated cpu which is coming back online. This is scheduled by + * cpu up but can race with other cpu hotplug operations and may be + * executed twice without intervening cpu down. */ static void worker_rebind_fn(struct work_struct *work) { @@ -1385,9 +1384,8 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); /* - * A rogue worker will become a regular one if CPU comes - * online later on. Make sure every worker has - * PF_THREAD_BOUND set. + * An unbound worker will become a regular one if CPU comes online + * later on. Make sure every worker has PF_THREAD_BOUND set. */ if (bind && !on_unbound_cpu) kthread_bind(worker->task, gcwq->cpu); @@ -3215,11 +3213,10 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running - * it with unbound (rogue) workers and allowing it to be reattached - * later if the cpu comes back online. A separate thread is created - * to govern a gcwq in such state and is called the trustee of the - * gcwq. + * This is solved by allowing a gcwq to be detached from CPU, running it + * with unbound workers and allowing it to be reattached later if the cpu + * comes back online. A separate thread is created to govern a gcwq in + * such state and is called the trustee of the gcwq. * * Trustee states and their descriptions. * @@ -3359,19 +3356,18 @@ static int __cpuinit trustee_thread(void *__gcwq) pool->flags |= POOL_MANAGING_WORKERS; list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags |= WORKER_ROGUE; + worker->flags |= WORKER_UNBOUND; } for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_ROGUE; + worker->flags |= WORKER_UNBOUND; gcwq->flags |= GCWQ_DISASSOCIATED; /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the rogue flag. This is - * necessary as scheduler callbacks may be invoked from other - * cpus. + * Call schedule() so that we cross rq->lock and thus can guarantee + * sched callbacks see the unbound flag. This is necessary as + * scheduler callbacks may be invoked from other cpus. */ spin_unlock_irq(&gcwq->lock); schedule(); @@ -3439,7 +3435,7 @@ static int __cpuinit trustee_thread(void *__gcwq) worker = create_worker(pool, false); spin_lock_irq(&gcwq->lock); if (worker) { - worker->flags |= WORKER_ROGUE; + worker->flags |= WORKER_UNBOUND; start_worker(worker); } } @@ -3488,7 +3484,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * rebinding is scheduled. */ worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; + worker->flags &= ~WORKER_UNBOUND; /* queue rebind_work, wq doesn't matter, use the default one */ if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, -- cgit v1.2.3 From 6037315269d62bf967286ae2670fdd6b6acedab9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: use mutex for global_cwq manager exclusion POOL_MANAGING_WORKERS is used to ensure that at most one worker takes the manager role at any given time on a given global_cwq. Trustee later hitched on it to assume manager adding blocking wait for the bit. As trustee already needed a custom wait mechanism, waiting for MANAGING_WORKERS was rolled into the same mechanism. Trustee is scheduled to be removed. This patch separates out MANAGING_WORKERS wait into per-pool mutex. Workers use mutex_trylock() to test for manager role and trustee uses mutex_lock() to claim manager roles. gcwq_claim/release_management() helpers are added to grab and release manager roles of all pools on a global_cwq. gcwq_claim_management() always grabs pool manager mutexes in ascending pool index order and uses pool index as lockdep subclass. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 65 ++++++++++++++++++++++-------------------------------- 1 file changed, 26 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index af512927c607..f7a00697d150 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -51,7 +51,6 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -155,6 +154,7 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ + struct mutex manager_mutex; /* mutex manager should hold */ struct ida worker_ida; /* L: for worker IDs */ struct worker *first_idle; /* L: first idle worker */ }; @@ -644,7 +644,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = pool->flags & POOL_MANAGING_WORKERS; + bool managing = mutex_is_locked(&pool->manager_mutex); int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1655,14 +1655,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool) static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; bool ret = false; - if (pool->flags & POOL_MANAGING_WORKERS) + if (!mutex_trylock(&pool->manager_mutex)) return ret; pool->flags &= ~POOL_MANAGE_WORKERS; - pool->flags |= POOL_MANAGING_WORKERS; /* * Destroy and then create so that may_start_working() is true @@ -1671,15 +1669,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - pool->flags &= ~POOL_MANAGING_WORKERS; - - /* - * The trustee might be waiting to take over the manager - * position, tell it we're done. - */ - if (unlikely(gcwq->trustee)) - wake_up_all(&gcwq->trustee_wait); - + mutex_unlock(&pool->manager_mutex); return ret; } @@ -3255,6 +3245,24 @@ EXPORT_SYMBOL_GPL(work_busy); * ----------------> RELEASE -------------- */ +/* claim manager positions of all pools */ +static void gcwq_claim_management(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + + for_each_worker_pool(pool, gcwq) + mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); +} + +/* release manager positions */ +static void gcwq_release_management(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + + for_each_worker_pool(pool, gcwq) + mutex_unlock(&pool->manager_mutex); +} + /** * trustee_wait_event_timeout - timed event wait for trustee * @cond: condition to wait for @@ -3304,16 +3312,6 @@ EXPORT_SYMBOL_GPL(work_busy); __ret1 < 0 ? -1 : 0; \ }) -static bool gcwq_is_managing_workers(struct global_cwq *gcwq) -{ - struct worker_pool *pool; - - for_each_worker_pool(pool, gcwq) - if (pool->flags & POOL_MANAGING_WORKERS) - return true; - return false; -} - static bool gcwq_has_idle_workers(struct global_cwq *gcwq) { struct worker_pool *pool; @@ -3336,15 +3334,8 @@ static int __cpuinit trustee_thread(void *__gcwq) BUG_ON(gcwq->cpu != smp_processor_id()); + gcwq_claim_management(gcwq); spin_lock_irq(&gcwq->lock); - /* - * Claim the manager position and make all workers rogue. - * Trustee must be bound to the target cpu and can't be - * cancelled. - */ - BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq)); - BUG_ON(rc < 0); /* * We've claimed all manager positions. Make all workers unbound @@ -3352,12 +3343,9 @@ static int __cpuinit trustee_thread(void *__gcwq) * ones which are still executing works from before the last CPU * down must be on the cpu. After this, they may become diasporas. */ - for_each_worker_pool(pool, gcwq) { - pool->flags |= POOL_MANAGING_WORKERS; - + for_each_worker_pool(pool, gcwq) list_for_each_entry(worker, &pool->idle_list, entry) worker->flags |= WORKER_UNBOUND; - } for_each_busy_worker(worker, i, pos, gcwq) worker->flags |= WORKER_UNBOUND; @@ -3497,9 +3485,7 @@ static int __cpuinit trustee_thread(void *__gcwq) work_color_to_flags(WORK_NO_COLOR)); } - /* relinquish manager role */ - for_each_worker_pool(pool, gcwq) - pool->flags &= ~POOL_MANAGING_WORKERS; + gcwq_release_management(gcwq); /* notify completion */ gcwq->trustee = NULL; @@ -3894,6 +3880,7 @@ static int __init init_workqueues(void) setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, (unsigned long)pool); + mutex_init(&pool->manager_mutex); ida_init(&pool->worker_ida); } -- cgit v1.2.3 From bc2ae0f5bb2f39e6db06a62f9d353e4601a332a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: drop @bind from create_worker() Currently, create_worker()'s callers are responsible for deciding whether the newly created worker should be bound to the associated CPU and create_worker() sets WORKER_UNBOUND only for the workers for the unbound global_cwq. Creation during normal operation is always via maybe_create_worker() and @bind is true. For workers created during hotplug, @bind is false. Normal operation path is planned to be used even while the CPU is going through hotplug operations or offline and this static decision won't work. Drop @bind from create_worker() and decide whether to bind by looking at GCWQ_DISASSOCIATED. create_worker() will also set WORKER_UNBOUND autmatically if disassociated. To avoid flipping GCWQ_DISASSOCIATED while create_worker() is in progress, the flag is now allowed to be changed only while holding all manager_mutexes on the global_cwq. This requires that GCWQ_DISASSOCIATED is not cleared behind trustee's back. CPU_ONLINE no longer clears DISASSOCIATED before flushing trustee, which clears DISASSOCIATED before rebinding remaining workers if asked to release. For cases where trustee isn't around, CPU_ONLINE clears DISASSOCIATED after flushing trustee. Also, now, first_idle has UNBOUND set on creation which is explicitly cleared by CPU_ONLINE while binding it. These convolutions will soon be removed by further simplification of CPU hotplug path. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 64 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f7a00697d150..e1d05e51a80a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -45,7 +45,22 @@ #include "workqueue_sched.h" enum { - /* global_cwq flags */ + /* + * global_cwq flags + * + * A bound gcwq is either associated or disassociated with its CPU. + * While associated (!DISASSOCIATED), all workers are bound to the + * CPU and none has %WORKER_UNBOUND set and concurrency management + * is in effect. + * + * While DISASSOCIATED, the cpu may be offline and all workers have + * %WORKER_UNBOUND set and concurrency management disabled, and may + * be executing on any CPU. The gcwq behaves as an unbound one. + * + * Note that DISASSOCIATED can be flipped only while holding + * managership of all pools on the gcwq to avoid changing binding + * state while create_worker() is in progress. + */ GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ GCWQ_FREEZING = 1 << 1, /* freeze in progress */ @@ -1334,7 +1349,6 @@ static struct worker *alloc_worker(void) /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to - * @bind: whether to set affinity to @cpu or not * * Create a new worker which is bound to @pool. The returned worker * can be started by calling start_worker() or destroyed using @@ -1346,10 +1360,9 @@ static struct worker *alloc_worker(void) * RETURNS: * Pointer to the newly created worker. */ -static struct worker *create_worker(struct worker_pool *pool, bool bind) +static struct worker *create_worker(struct worker_pool *pool) { struct global_cwq *gcwq = pool->gcwq; - bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; const char *pri = worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1370,7 +1383,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) worker->pool = pool; worker->id = id; - if (!on_unbound_cpu) + if (gcwq->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(gcwq->cpu), "kworker/%u:%d%s", gcwq->cpu, id, pri); @@ -1384,15 +1397,19 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); /* - * An unbound worker will become a regular one if CPU comes online - * later on. Make sure every worker has PF_THREAD_BOUND set. + * Determine CPU binding of the new worker depending on + * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the + * flag remains stable across this function. See the comments + * above the flag definition for details. + * + * As an unbound worker may later become a regular one if CPU comes + * online, make sure every worker has %PF_THREAD_BOUND set. */ - if (bind && !on_unbound_cpu) + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { kthread_bind(worker->task, gcwq->cpu); - else { + } else { worker->task->flags |= PF_THREAD_BOUND; - if (on_unbound_cpu) - worker->flags |= WORKER_UNBOUND; + worker->flags |= WORKER_UNBOUND; } return worker; @@ -1568,7 +1585,7 @@ restart: while (true) { struct worker *worker; - worker = create_worker(pool, true); + worker = create_worker(pool); if (worker) { del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); @@ -3420,12 +3437,10 @@ static int __cpuinit trustee_thread(void *__gcwq) if (need_to_create_worker(pool)) { spin_unlock_irq(&gcwq->lock); - worker = create_worker(pool, false); + worker = create_worker(pool); spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_UNBOUND; + if (worker) start_worker(worker); - } } } @@ -3463,6 +3478,10 @@ static int __cpuinit trustee_thread(void *__gcwq) for_each_worker_pool(pool, gcwq) WARN_ON(!list_empty(&pool->idle_list)); + /* if we're reassociating, clear DISASSOCIATED */ + if (gcwq->trustee_state == TRUSTEE_RELEASE) + gcwq->flags &= ~GCWQ_DISASSOCIATED; + for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; @@ -3546,7 +3565,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, i = 0; for_each_worker_pool(pool, gcwq) { BUG_ON(pool->first_idle); - new_workers[i] = create_worker(pool, false); + new_workers[i] = create_worker(pool); if (!new_workers[i++]) goto err_destroy; } @@ -3584,13 +3603,19 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - gcwq->flags &= ~GCWQ_DISASSOCIATED; if (gcwq->trustee_state != TRUSTEE_DONE) { gcwq->trustee_state = TRUSTEE_RELEASE; wake_up_process(gcwq->trustee); wait_trustee_state(gcwq, TRUSTEE_DONE); } + /* + * Either DISASSOCIATED is already cleared or no worker is + * left on the gcwq. Safe to clear DISASSOCIATED without + * claiming managers. + */ + gcwq->flags &= ~GCWQ_DISASSOCIATED; + /* * Trustee is done and there might be no worker left. * Put the first_idle in and request a real manager to @@ -3601,6 +3626,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, kthread_bind(pool->first_idle->task, cpu); spin_lock_irq(&gcwq->lock); pool->flags |= POOL_MANAGE_WORKERS; + pool->first_idle->flags &= ~WORKER_UNBOUND; start_worker(pool->first_idle); pool->first_idle = NULL; } @@ -3899,7 +3925,7 @@ static int __init init_workqueues(void) for_each_worker_pool(pool, gcwq) { struct worker *worker; - worker = create_worker(pool, true); + worker = create_worker(pool); BUG_ON(!worker); spin_lock_irq(&gcwq->lock); start_worker(worker); -- cgit v1.2.3 From 25511a477657884d2164f338341fa89652610507 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: reimplement CPU online rebinding to handle idle workers Currently, if there are left workers when a CPU is being brough back online, the trustee kills all idle workers and scheduled rebind_work so that they re-bind to the CPU after the currently executing work is finished. This works for busy workers because concurrency management doesn't try to wake up them from scheduler callbacks, which require the target task to be on the local run queue. The busy worker bumps concurrency counter appropriately as it clears WORKER_UNBOUND from the rebind work item and it's bound to the CPU before returning to the idle state. To reduce CPU on/offlining overhead (as many embedded systems use it for powersaving) and simplify the code path, workqueue is planned to be modified to retain idle workers across CPU on/offlining. This patch reimplements CPU online rebinding such that it can also handle idle workers. As noted earlier, due to the local wakeup requirement, rebinding idle workers is tricky. All idle workers must be re-bound before scheduler callbacks are enabled. This is achieved by interlocking idle re-binding. Idle workers are requested to re-bind and then hold until all idle re-binding is complete so that no bound worker starts executing work item. Only after all idle workers are re-bound and parked, CPU_ONLINE proceeds to release them and queue rebind work item to busy workers thus guaranteeing scheduler callbacks aren't invoked until all idle workers are ready. worker_rebind_fn() is renamed to busy_worker_rebind_fn() and idle_worker_rebind() for idle workers is added. Rebinding logic is moved to rebind_workers() and now called from CPU_ONLINE after flushing trustee. While at it, add CPU sanity check in worker_thread(). Note that now a worker may become idle or the manager between trustee release and rebinding during CPU_ONLINE. As the previous patch updated create_worker() so that it can be used by regular manager while unbound and this patch implements idle re-binding, this is safe. This prepares for removal of trustee and keeping idle workers across CPU hotplugs. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 215 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 166 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1d05e51a80a..6927fecae412 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -133,6 +133,7 @@ enum { struct global_cwq; struct worker_pool; +struct idle_rebind; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -154,7 +155,10 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - struct work_struct rebind_work; /* L: rebind worker to cpu */ + + /* for rebinding worker to CPU */ + struct idle_rebind *idle_rebind; /* L: for idle worker */ + struct work_struct rebind_work; /* L: for busy worker */ }; struct worker_pool { @@ -190,6 +194,8 @@ struct global_cwq { struct worker_pool pools[2]; /* normal and highpri pools */ + wait_queue_head_t rebind_hold; /* rebind hold wait */ + struct task_struct *trustee; /* L: for gcwq shutdown */ unsigned int trustee_state; /* L: trustee state */ wait_queue_head_t trustee_wait; /* trustee wait */ @@ -1314,13 +1320,37 @@ __acquires(&gcwq->lock) } } +struct idle_rebind { + int cnt; /* # workers to be rebound */ + struct completion done; /* all workers rebound */ +}; + +/* + * Rebind an idle @worker to its CPU. During CPU onlining, this has to + * happen synchronously for idle workers. worker_thread() will test + * %WORKER_REBIND before leaving idle and call this function. + */ +static void idle_worker_rebind(struct worker *worker) +{ + struct global_cwq *gcwq = worker->pool->gcwq; + + /* CPU must be online at this point */ + WARN_ON(!worker_maybe_bind_and_lock(worker)); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); + + /* we did our part, wait for rebind_workers() to finish up */ + wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); +} + /* - * Function for worker->rebind_work used to rebind unbound busy workers to + * Function for @worker->rebind.work used to rebind unbound busy workers to * the associated cpu which is coming back online. This is scheduled by * cpu up but can race with other cpu hotplug operations and may be * executed twice without intervening cpu down. */ -static void worker_rebind_fn(struct work_struct *work) +static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; @@ -1331,6 +1361,112 @@ static void worker_rebind_fn(struct work_struct *work) spin_unlock_irq(&gcwq->lock); } +/** + * rebind_workers - rebind all workers of a gcwq to the associated CPU + * @gcwq: gcwq of interest + * + * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding + * is different for idle and busy ones. + * + * The idle ones should be rebound synchronously and idle rebinding should + * be complete before any worker starts executing work items with + * concurrency management enabled; otherwise, scheduler may oops trying to + * wake up non-local idle worker from wq_worker_sleeping(). + * + * This is achieved by repeatedly requesting rebinding until all idle + * workers are known to have been rebound under @gcwq->lock and holding all + * idle workers from becoming busy until idle rebinding is complete. + * + * Once idle workers are rebound, busy workers can be rebound as they + * finish executing their current work items. Queueing the rebind work at + * the head of their scheduled lists is enough. Note that nr_running will + * be properbly bumped as busy workers rebind. + * + * On return, all workers are guaranteed to either be bound or have rebind + * work item scheduled. + */ +static void rebind_workers(struct global_cwq *gcwq) + __releases(&gcwq->lock) __acquires(&gcwq->lock) +{ + struct idle_rebind idle_rebind; + struct worker_pool *pool; + struct worker *worker; + struct hlist_node *pos; + int i; + + lockdep_assert_held(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) + lockdep_assert_held(&pool->manager_mutex); + + /* + * Rebind idle workers. Interlocked both ways. We wait for + * workers to rebind via @idle_rebind.done. Workers will wait for + * us to finish up by watching %WORKER_REBIND. + */ + init_completion(&idle_rebind.done); +retry: + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + /* set REBIND and kick idle ones, we'll wait for these later */ + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { + if (worker->flags & WORKER_REBIND) + continue; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + idle_rebind.cnt++; + worker->idle_rebind = &idle_rebind; + + /* worker_thread() will call idle_worker_rebind() */ + wake_up_process(worker->task); + } + } + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + /* busy ones might have become idle while waiting, retry */ + goto retry; + } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); + + /* rebind busy workers */ + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + /* wq doesn't matter, use the default one */ + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } +} + static struct worker *alloc_worker(void) { struct worker *worker; @@ -1339,7 +1475,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, worker_rebind_fn); + INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1829,6 +1965,9 @@ __acquires(&gcwq->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif + WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + raw_smp_processor_id() != gcwq->cpu); + /* * A single work shouldn't be executed concurrently by * multiple workers on a single cpu. Check whether anyone is @@ -1946,11 +2085,20 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&gcwq->lock); - /* DIE can be set only while we're idle, checking here is enough */ - if (worker->flags & WORKER_DIE) { + /* + * DIE can be set only while idle and REBIND set while busy has + * @worker->rebind_work scheduled. Checking here is enough. + */ + if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { spin_unlock_irq(&gcwq->lock); - worker->task->flags &= ~PF_WQ_WORKER; - return 0; + + if (worker->flags & WORKER_DIE) { + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + idle_worker_rebind(worker); + goto woke_up; } worker_leave_idle(worker); @@ -3468,42 +3616,6 @@ static int __cpuinit trustee_thread(void *__gcwq) } } while (i && rc >= 0); - /* - * At this point, either draining has completed and no worker - * is left, or cpu down has been canceled or the cpu is being - * brought back up. There shouldn't be any idle one left. - * Tell the remaining busy ones to rebind once it finishes the - * currently scheduled works by scheduling the rebind_work. - */ - for_each_worker_pool(pool, gcwq) - WARN_ON(!list_empty(&pool->idle_list)); - - /* if we're reassociating, clear DISASSOCIATED */ - if (gcwq->trustee_state == TRUSTEE_RELEASE) - gcwq->flags &= ~GCWQ_DISASSOCIATED; - - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; - - /* - * Rebind_work may race with future cpu hotplug - * operations. Use a separate flag to mark that - * rebinding is scheduled. - */ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_UNBOUND; - - /* queue rebind_work, wq doesn't matter, use the default one */ - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - gcwq_release_management(gcwq); /* notify completion */ @@ -3609,13 +3721,16 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, wait_trustee_state(gcwq, TRUSTEE_DONE); } - /* - * Either DISASSOCIATED is already cleared or no worker is - * left on the gcwq. Safe to clear DISASSOCIATED without - * claiming managers. - */ + spin_unlock_irq(&gcwq->lock); + gcwq_claim_management(gcwq); + spin_lock_irq(&gcwq->lock); + gcwq->flags &= ~GCWQ_DISASSOCIATED; + rebind_workers(gcwq); + + gcwq_release_management(gcwq); + /* * Trustee is done and there might be no worker left. * Put the first_idle in and request a real manager to @@ -3910,6 +4025,8 @@ static int __init init_workqueues(void) ida_init(&pool->worker_ida); } + init_waitqueue_head(&gcwq->rebind_hold); + gcwq->trustee_state = TRUSTEE_DONE; init_waitqueue_head(&gcwq->trustee_wait); } -- cgit v1.2.3 From 3ce63377305b694f53e7dd0c72907591c5344224 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: don't butcher idle workers on an offline CPU Currently, during CPU offlining, after all pending work items are drained, the trustee butchers all workers. Also, on CPU onlining failure, workqueue_cpu_callback() ensures that the first idle worker is destroyed. Combined, these guarantee that an offline CPU doesn't have any worker for it once all the lingering work items are finished. This guarantee isn't really necessary and makes CPU on/offlining more expensive than needs to be, especially for platforms which use CPU hotplug for powersaving. This patch lets offline CPUs removes idle worker butchering from the trustee and let a CPU which failed onlining keep the created first worker. The first worker is created if the CPU doesn't have any during CPU_DOWN_PREPARE and started right away. If onlining succeeds, the rebind_workers() call in CPU_ONLINE will rebind it like any other workers. If onlining fails, the worker is left alone till the next try. This makes CPU hotplugs cheaper by allowing global_cwqs to keep workers across them and simplifies code. Note that trustee doesn't re-arm idle timer when it's done and thus the disassociated global_cwq will keep all workers until it comes back online. This will be improved by further patches. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 94 ++++++++---------------------------------------------- 1 file changed, 14 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6927fecae412..acfabb22e2c4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -175,7 +175,6 @@ struct worker_pool { struct mutex manager_mutex; /* mutex manager should hold */ struct ida worker_ida; /* L: for worker IDs */ - struct worker *first_idle; /* L: first idle worker */ }; /* @@ -3477,16 +3476,6 @@ static void gcwq_release_management(struct global_cwq *gcwq) __ret1 < 0 ? -1 : 0; \ }) -static bool gcwq_has_idle_workers(struct global_cwq *gcwq) -{ - struct worker_pool *pool; - - for_each_worker_pool(pool, gcwq) - if (!list_empty(&pool->idle_list)) - return true; - return false; -} - static int __cpuinit trustee_thread(void *__gcwq) { struct global_cwq *gcwq = __gcwq; @@ -3494,7 +3483,6 @@ static int __cpuinit trustee_thread(void *__gcwq) struct worker *worker; struct work_struct *work; struct hlist_node *pos; - long rc; int i; BUG_ON(gcwq->cpu != smp_processor_id()); @@ -3597,25 +3585,6 @@ static int __cpuinit trustee_thread(void *__gcwq) break; } - /* - * Either all works have been scheduled and cpu is down, or - * cpu down has already been canceled. Wait for and butcher - * all workers till we're canceled. - */ - do { - rc = trustee_wait_event(gcwq_has_idle_workers(gcwq)); - - i = 0; - for_each_worker_pool(pool, gcwq) { - while (!list_empty(&pool->idle_list)) { - worker = list_first_entry(&pool->idle_list, - struct worker, entry); - destroy_worker(worker); - } - i |= pool->nr_workers; - } - } while (i && rc >= 0); - gcwq_release_management(gcwq); /* notify completion */ @@ -3658,10 +3627,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); struct task_struct *new_trustee = NULL; - struct worker *new_workers[NR_WORKER_POOLS] = { }; struct worker_pool *pool; unsigned long flags; - int i; action &= ~CPU_TASKS_FROZEN; @@ -3672,14 +3639,22 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, if (IS_ERR(new_trustee)) return notifier_from_errno(PTR_ERR(new_trustee)); kthread_bind(new_trustee, cpu); - /* fall through */ + break; + case CPU_UP_PREPARE: - i = 0; for_each_worker_pool(pool, gcwq) { - BUG_ON(pool->first_idle); - new_workers[i] = create_worker(pool); - if (!new_workers[i++]) - goto err_destroy; + struct worker *worker; + + if (pool->nr_workers) + continue; + + worker = create_worker(pool); + if (!worker) + return NOTIFY_BAD; + + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); } } @@ -3694,23 +3669,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, gcwq->trustee_state = TRUSTEE_START; wake_up_process(gcwq->trustee); wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - /* fall through */ - case CPU_UP_PREPARE: - i = 0; - for_each_worker_pool(pool, gcwq) { - BUG_ON(pool->first_idle); - pool->first_idle = new_workers[i++]; - } break; case CPU_POST_DEAD: gcwq->trustee_state = TRUSTEE_BUTCHER; - /* fall through */ - case CPU_UP_CANCELED: - for_each_worker_pool(pool, gcwq) { - destroy_worker(pool->first_idle); - pool->first_idle = NULL; - } break; case CPU_DOWN_FAILED: @@ -3730,39 +3692,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, rebind_workers(gcwq); gcwq_release_management(gcwq); - - /* - * Trustee is done and there might be no worker left. - * Put the first_idle in and request a real manager to - * take a look. - */ - for_each_worker_pool(pool, gcwq) { - spin_unlock_irq(&gcwq->lock); - kthread_bind(pool->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - pool->flags |= POOL_MANAGE_WORKERS; - pool->first_idle->flags &= ~WORKER_UNBOUND; - start_worker(pool->first_idle); - pool->first_idle = NULL; - } break; } spin_unlock_irqrestore(&gcwq->lock, flags); return notifier_from_errno(0); - -err_destroy: - if (new_trustee) - kthread_stop(new_trustee); - - spin_lock_irqsave(&gcwq->lock, flags); - for (i = 0; i < NR_WORKER_POOLS; i++) - if (new_workers[i]) - destroy_worker(new_workers[i]); - spin_unlock_irqrestore(&gcwq->lock, flags); - - return NOTIFY_BAD; } /* @@ -3775,7 +3710,6 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_CANCELED: case CPU_DOWN_FAILED: case CPU_ONLINE: return workqueue_cpu_callback(nfb, action, hcpu); -- cgit v1.2.3 From 628c78e7ea19d5b70d2b6a59030362168cdbe1ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: remove CPU offline trustee With the previous changes, a disassociated global_cwq now can run as an unbound one on its own - it can create workers as necessary to drain remaining works after the CPU has been brought down and manage the number of workers using the usual idle timer mechanism making trustee completely redundant except for the actual unbinding operation. This patch removes the trustee and let a disassociated global_cwq manage itself. Unbinding is moved to a work item (for CPU affinity) which is scheduled and flushed from CPU_DONW_PREPARE. This patch moves nr_running clearing outside gcwq and manager locks to simplify the code. As nr_running is unused at the point, this is safe. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 288 +++++++---------------------------------------------- 1 file changed, 36 insertions(+), 252 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index acfabb22e2c4..d1545daa74ad 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -79,13 +79,6 @@ enum { WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | WORKER_CPU_INTENSIVE, - /* gcwq->trustee_state */ - TRUSTEE_START = 0, /* start */ - TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ - TRUSTEE_BUTCHER = 2, /* butcher workers */ - TRUSTEE_RELEASE = 3, /* release workers */ - TRUSTEE_DONE = 4, /* trustee is done */ - NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ @@ -100,7 +93,6 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ - TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by @@ -194,10 +186,6 @@ struct global_cwq { struct worker_pool pools[2]; /* normal and highpri pools */ wait_queue_head_t rebind_hold; /* rebind hold wait */ - - struct task_struct *trustee; /* L: for gcwq shutdown */ - unsigned int trustee_state; /* L: trustee state */ - wait_queue_head_t trustee_wait; /* trustee wait */ } ____cacheline_aligned_in_smp; /* @@ -753,11 +741,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * worklist not empty test sequence is in insert_work(). * Please read comment there. * - * NOT_RUNNING is clear. This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. + * NOT_RUNNING is clear. This means that we're bound to and + * running on the local cpu w/ rq lock held and preemption + * disabled, which in turn means that none else could be + * manipulating idle_list, so dereferencing idle_list without gcwq + * lock is safe. */ if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) to_wakeup = first_worker(pool); @@ -1217,19 +1205,16 @@ static void worker_enter_idle(struct worker *worker) /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); - if (likely(gcwq->trustee_state != TRUSTEE_DONE)) { - if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) - mod_timer(&pool->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } else - wake_up_all(&gcwq->trustee_wait); + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because trustee releases gcwq->lock - * between setting %WORKER_UNBOUND and zapping nr_running, the - * warning may trigger spuriously. Check iff trustee is idle. + * Sanity check nr_running. Because gcwq_unbind_fn() releases + * gcwq->lock between setting %WORKER_UNBOUND and zapping + * nr_running, the warning may trigger spuriously. Check iff + * unbind is not in progress. */ - WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && + WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && pool->nr_workers == pool->nr_idle && atomic_read(get_pool_nr_running(pool))); } @@ -3367,46 +3352,9 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running it - * with unbound workers and allowing it to be reattached later if the cpu - * comes back online. A separate thread is created to govern a gcwq in - * such state and is called the trustee of the gcwq. - * - * Trustee states and their descriptions. - * - * START Command state used on startup. On CPU_DOWN_PREPARE, a - * new trustee is started with this state. - * - * IN_CHARGE Once started, trustee will enter this state after - * assuming the manager role and making all existing - * workers rogue. DOWN_PREPARE waits for trustee to - * enter this state. After reaching IN_CHARGE, trustee - * tries to execute the pending worklist until it's empty - * and the state is set to BUTCHER, or the state is set - * to RELEASE. - * - * BUTCHER Command state which is set by the cpu callback after - * the cpu has went down. Once this state is set trustee - * knows that there will be no new works on the worklist - * and once the worklist is empty it can proceed to - * killing idle workers. - * - * RELEASE Command state which is set by the cpu callback if the - * cpu down has been canceled or it has come online - * again. After recognizing this state, trustee stops - * trying to drain or butcher and clears ROGUE, rebinds - * all remaining workers back to the cpu and releases - * manager role. - * - * DONE Trustee will enter this state after BUTCHER or RELEASE - * is complete. - * - * trustee CPU draining - * took over down complete - * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE - * | | ^ - * | CPU is back online v return workers | - * ----------------> RELEASE -------------- + * This is solved by allowing a gcwq to be disassociated from the CPU + * running as an unbound one and allowing it to be reattached later if the + * cpu comes back online. */ /* claim manager positions of all pools */ @@ -3427,61 +3375,11 @@ static void gcwq_release_management(struct global_cwq *gcwq) mutex_unlock(&pool->manager_mutex); } -/** - * trustee_wait_event_timeout - timed event wait for trustee - * @cond: condition to wait for - * @timeout: timeout in jiffies - * - * wait_event_timeout() for trustee to use. Handles locking and - * checks for RELEASE request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * Positive indicating left time if @cond is satisfied, 0 if timed - * out, -1 if canceled. - */ -#define trustee_wait_event_timeout(cond, timeout) ({ \ - long __ret = (timeout); \ - while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ - __ret) { \ - spin_unlock_irq(&gcwq->lock); \ - __wait_event_timeout(gcwq->trustee_wait, (cond) || \ - (gcwq->trustee_state == TRUSTEE_RELEASE), \ - __ret); \ - spin_lock_irq(&gcwq->lock); \ - } \ - gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ -}) - -/** - * trustee_wait_event - event wait for trustee - * @cond: condition to wait for - * - * wait_event() for trustee to use. Automatically handles locking and - * checks for CANCEL request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * 0 if @cond is satisfied, -1 if canceled. - */ -#define trustee_wait_event(cond) ({ \ - long __ret1; \ - __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ - __ret1 < 0 ? -1 : 0; \ -}) - -static int __cpuinit trustee_thread(void *__gcwq) +static void gcwq_unbind_fn(struct work_struct *work) { - struct global_cwq *gcwq = __gcwq; + struct global_cwq *gcwq = get_gcwq(smp_processor_id()); struct worker_pool *pool; struct worker *worker; - struct work_struct *work; struct hlist_node *pos; int i; @@ -3505,119 +3403,29 @@ static int __cpuinit trustee_thread(void *__gcwq) gcwq->flags |= GCWQ_DISASSOCIATED; + spin_unlock_irq(&gcwq->lock); + gcwq_release_management(gcwq); + /* * Call schedule() so that we cross rq->lock and thus can guarantee - * sched callbacks see the unbound flag. This is necessary as - * scheduler callbacks may be invoked from other cpus. + * sched callbacks see the %WORKER_UNBOUND flag. This is necessary + * as scheduler callbacks may be invoked from other cpus. */ - spin_unlock_irq(&gcwq->lock); schedule(); - spin_lock_irq(&gcwq->lock); /* - * Sched callbacks are disabled now. Zap nr_running. After - * this, nr_running stays zero and need_more_worker() and - * keep_working() are always true as long as the worklist is - * not empty. + * Sched callbacks are disabled now. Zap nr_running. After this, + * nr_running stays zero and need_more_worker() and keep_working() + * are always true as long as the worklist is not empty. @gcwq now + * behaves as unbound (in terms of concurrency management) gcwq + * which is served by workers tied to the CPU. + * + * On return from this function, the current worker would trigger + * unbound chain execution of pending work items if other workers + * didn't already. */ for_each_worker_pool(pool, gcwq) atomic_set(get_pool_nr_running(pool), 0); - - spin_unlock_irq(&gcwq->lock); - for_each_worker_pool(pool, gcwq) - del_timer_sync(&pool->idle_timer); - spin_lock_irq(&gcwq->lock); - - /* - * We're now in charge. Notify and proceed to drain. We need - * to keep the gcwq running during the whole CPU down - * procedure as other cpu hotunplug callbacks may need to - * flush currently running tasks. - */ - gcwq->trustee_state = TRUSTEE_IN_CHARGE; - wake_up_all(&gcwq->trustee_wait); - - /* - * The original cpu is in the process of dying and may go away - * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. We - * want to get it over with ASAP - spam rescuers, wake up as - * many idlers as necessary and create new ones till the - * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezable cwqs. Don't declare - * completion while frozen. - */ - while (true) { - bool busy = false; - - for_each_worker_pool(pool, gcwq) - busy |= pool->nr_workers != pool->nr_idle; - - if (!busy && !(gcwq->flags & GCWQ_FREEZING) && - gcwq->trustee_state != TRUSTEE_IN_CHARGE) - break; - - for_each_worker_pool(pool, gcwq) { - int nr_works = 0; - - list_for_each_entry(work, &pool->worklist, entry) { - send_mayday(work); - nr_works++; - } - - list_for_each_entry(worker, &pool->idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } - - if (need_to_create_worker(pool)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(pool); - spin_lock_irq(&gcwq->lock); - if (worker) - start_worker(worker); - } - } - - /* give a breather */ - if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) - break; - } - - gcwq_release_management(gcwq); - - /* notify completion */ - gcwq->trustee = NULL; - gcwq->trustee_state = TRUSTEE_DONE; - wake_up_all(&gcwq->trustee_wait); - spin_unlock_irq(&gcwq->lock); - return 0; -} - -/** - * wait_trustee_state - wait for trustee to enter the specified state - * @gcwq: gcwq the trustee of interest belongs to - * @state: target state to wait for - * - * Wait for the trustee to reach @state. DONE is already matched. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by cpu_callback. - */ -static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!(gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE)) { - spin_unlock_irq(&gcwq->lock); - __wait_event(gcwq->trustee_wait, - gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE); - spin_lock_irq(&gcwq->lock); - } } static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, @@ -3626,19 +3434,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); - struct task_struct *new_trustee = NULL; struct worker_pool *pool; + struct work_struct unbind_work; unsigned long flags; action &= ~CPU_TASKS_FROZEN; switch (action) { case CPU_DOWN_PREPARE: - new_trustee = kthread_create(trustee_thread, gcwq, - "workqueue_trustee/%d\n", cpu); - if (IS_ERR(new_trustee)) - return notifier_from_errno(PTR_ERR(new_trustee)); - kthread_bind(new_trustee, cpu); + /* unbinding should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + schedule_work_on(cpu, &unbind_work); + flush_work(&unbind_work); break; case CPU_UP_PREPARE: @@ -3662,27 +3469,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, spin_lock_irqsave(&gcwq->lock, flags); switch (action) { - case CPU_DOWN_PREPARE: - /* initialize trustee and tell it to acquire the gcwq */ - BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); - gcwq->trustee = new_trustee; - gcwq->trustee_state = TRUSTEE_START; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - break; - - case CPU_POST_DEAD: - gcwq->trustee_state = TRUSTEE_BUTCHER; - break; - case CPU_DOWN_FAILED: case CPU_ONLINE: - if (gcwq->trustee_state != TRUSTEE_DONE) { - gcwq->trustee_state = TRUSTEE_RELEASE; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_DONE); - } - spin_unlock_irq(&gcwq->lock); gcwq_claim_management(gcwq); spin_lock_irq(&gcwq->lock); @@ -3727,7 +3515,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - case CPU_POST_DEAD: return workqueue_cpu_callback(nfb, action, hcpu); } return NOTIFY_OK; @@ -3960,9 +3747,6 @@ static int __init init_workqueues(void) } init_waitqueue_head(&gcwq->rebind_hold); - - gcwq->trustee_state = TRUSTEE_DONE; - init_waitqueue_head(&gcwq->trustee_wait); } /* create the initial worker */ -- cgit v1.2.3 From 8db25e7891a47e03db6f04344a9c92be16e391bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:28 -0700 Subject: workqueue: simplify CPU hotplug code With trustee gone, CPU hotplug code can be simplified. * gcwq_claim/release_management() now grab and release gcwq lock too respectively and gained _and_lock and _and_unlock postfixes. * All CPU hotplug logic was implemented in workqueue_cpu_callback() which was called by workqueue_cpu_up/down_callback() for the correct priority. This was because up and down paths shared a lot of logic, which is no longer true. Remove workqueue_cpu_callback() and move all hotplug logic into the two actual callbacks. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 79 +++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d1545daa74ad..471996a81633 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3358,19 +3358,21 @@ EXPORT_SYMBOL_GPL(work_busy); */ /* claim manager positions of all pools */ -static void gcwq_claim_management(struct global_cwq *gcwq) +static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) { struct worker_pool *pool; for_each_worker_pool(pool, gcwq) mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); + spin_lock_irq(&gcwq->lock); } /* release manager positions */ -static void gcwq_release_management(struct global_cwq *gcwq) +static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) { struct worker_pool *pool; + spin_unlock_irq(&gcwq->lock); for_each_worker_pool(pool, gcwq) mutex_unlock(&pool->manager_mutex); } @@ -3385,8 +3387,7 @@ static void gcwq_unbind_fn(struct work_struct *work) BUG_ON(gcwq->cpu != smp_processor_id()); - gcwq_claim_management(gcwq); - spin_lock_irq(&gcwq->lock); + gcwq_claim_management_and_lock(gcwq); /* * We've claimed all manager positions. Make all workers unbound @@ -3403,8 +3404,7 @@ static void gcwq_unbind_fn(struct work_struct *work) gcwq->flags |= GCWQ_DISASSOCIATED; - spin_unlock_irq(&gcwq->lock); - gcwq_release_management(gcwq); + gcwq_release_management_and_unlock(gcwq); /* * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3428,26 +3428,19 @@ static void gcwq_unbind_fn(struct work_struct *work) atomic_set(get_pool_nr_running(pool), 0); } -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. + */ +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; - struct work_struct unbind_work; - unsigned long flags; - - action &= ~CPU_TASKS_FROZEN; - - switch (action) { - case CPU_DOWN_PREPARE: - /* unbinding should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); - schedule_work_on(cpu, &unbind_work); - flush_work(&unbind_work); - break; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: for_each_worker_pool(pool, gcwq) { struct worker *worker; @@ -3463,45 +3456,16 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, start_worker(worker); spin_unlock_irq(&gcwq->lock); } - } - - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); + break; - switch (action) { case CPU_DOWN_FAILED: case CPU_ONLINE: - spin_unlock_irq(&gcwq->lock); - gcwq_claim_management(gcwq); - spin_lock_irq(&gcwq->lock); - + gcwq_claim_management_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; - rebind_workers(gcwq); - - gcwq_release_management(gcwq); + gcwq_release_management_and_unlock(gcwq); break; } - - spin_unlock_irqrestore(&gcwq->lock, flags); - - return notifier_from_errno(0); -} - -/* - * Workqueues should be brought up before normal priority CPU notifiers. - * This will be registered high priority CPU notifier. - */ -static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: - case CPU_ONLINE: - return workqueue_cpu_callback(nfb, action, hcpu); - } return NOTIFY_OK; } @@ -3513,9 +3477,16 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { + unsigned int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - return workqueue_cpu_callback(nfb, action, hcpu); + /* unbinding should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + schedule_work_on(cpu, &unbind_work); + flush_work(&unbind_work); + break; } return NOTIFY_OK; } -- cgit v1.2.3 From 11388c87d2abca1f01975ced28ce9eacea239104 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2012 00:00:58 +0200 Subject: PM / Sleep: Require CAP_BLOCK_SUSPEND to use wake_lock/wake_unlock Require processes wanting to use the wake_lock/wake_unlock sysfs files to have the CAP_BLOCK_SUSPEND capability, which also is required for the eventpoll EPOLLWAKEUP flag to be effective, so that all interfaces related to blocking autosleep depend on the same capability. Signed-off-by: Rafael J. Wysocki Cc: stable@vger.kernel.org Acked-by: Michael Kerrisk --- kernel/power/wakelock.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index c8fba3380076..8f50de394d22 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -9,6 +9,7 @@ * manipulate wakelocks on Android. */ +#include #include #include #include @@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf) size_t len; int ret = 0; + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + while (*str && !isspace(*str)) str++; @@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf) size_t len; int ret = 0; + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + len = strlen(buf); if (!len) return -EINVAL; -- cgit v1.2.3 From eea03c20ae38a55405c0865ed9adfccc400e4c8e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 Jul 2012 18:15:46 -0700 Subject: Make wait_for_device_probe() also do scsi_complete_async_scans() Commit a7a20d103994 ("sd: limit the scope of the async probe domain") make the SCSI device probing run device discovery in it's own async domain. However, as a result, the partition detection was no longer synchronized by async_synchronize_full() (which, despite the name, only synchronizes the global async space, not all of them). Which in turn meant that "wait_for_device_probe()" would not wait for the SCSI partitions to be parsed. And "wait_for_device_probe()" was what the boot time init code relied on for mounting the root filesystem. Now, most people never noticed this, because not only is it timing-dependent, but modern distributions all use initrd. So the root filesystem isn't actually on a disk at all. And then before they actually mount the final disk filesystem, they will have loaded the scsi-wait-scan module, which not only does the expected wait_for_device_probe(), but also does scsi_complete_async_scans(). [ Side note: scsi_complete_async_scans() had also been partially broken, but that was fixed in commit 43a8d39d0137 ("fix async probe regression"), so that same commit a7a20d103994 had actually broken setups even if you used scsi-wait-scan explicitly ] Solve this problem by just moving the scsi_complete_async_scans() call into wait_for_device_probe(). Everybody who wants to wait for device probing to finish really wants the SCSI probing to complete, so there's no reason not to do this. So now "wait_for_device_probe()" really does what the name implies, and properly waits for device probing to finish. This also removes the now unnecessary extra calls to scsi_complete_async_scans(). Reported-and-tested-by: Artem S. Tashkinov Cc: Dan Williams Cc: Alan Stern Cc: James Bottomley Cc: Borislav Petkov Cc: linux-scsi Signed-off-by: Linus Torvalds --- kernel/power/hibernate.c | 8 -------- kernel/power/user.c | 2 -- 2 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8b53db38a279..238025f5472e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -27,7 +27,6 @@ #include #include #include -#include #include "power.h" @@ -748,13 +747,6 @@ static int software_resume(void) async_synchronize_full(); } - /* - * We can't depend on SCSI devices being available after loading - * one of their modules until scsi_complete_async_scans() is - * called and the resume device usually is a SCSI one. - */ - scsi_complete_async_scans(); - swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { error = -ENODEV; diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -24,7 +24,6 @@ #include #include #include -#include #include @@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) * appear. */ wait_for_device_probe(); - scsi_complete_async_scans(); data->swap = -1; data->mode = O_WRONLY; -- cgit v1.2.3 From 6791457a090d9a234a40b501c2536f0aefaeae4b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 18 Jul 2012 13:18:12 -0400 Subject: printk: Export struct log size and member offsets through vmcoreinfo There are tools like makedumpfile and vmcore-dmesg which can extract kernel log buffer from vmcore. Since we introduced structured logging, that functionality is broken. Now user space tools need to know about "struct log" and offsets of various fields to be able to parse struct log data and extract text message or dictonary. This patch exports some of the fields. Currently I am not exporting log "level" info as that is a bitfield and offsetof() bitfields can't be calculated. But if people start asking for log level info in the output then we probably either need to seprate out "level" or use bit shift operations for flags and level. Signed-off-by: Vivek Goyal Acked-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 4da2377131b0..449364f07a1e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -671,6 +671,15 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); VMCOREINFO_SYMBOL(log_next_idx); + /* + * Export struct log size and field offsets. User space tools can + * parse it and detect any changes to structure down the line. + */ + VMCOREINFO_STRUCT_SIZE(log); + VMCOREINFO_OFFSET(log, ts_nsec); + VMCOREINFO_OFFSET(log, len); + VMCOREINFO_OFFSET(log, text_len); + VMCOREINFO_OFFSET(log, dict_len); } #endif -- cgit v1.2.3 From 2955b47d2c1983998a8c5915cb96884e67f7cb53 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 Jul 2012 19:33:25 -0700 Subject: [SCSI] async: introduce 'async_domain' type This is in preparation for teaching async_synchronize_full() to sync all pending async work, and not just on the async_running domain. This conversion is functionally equivalent, just embedding the existing list in a new async_domain type. The .registered attribute is used in a later patch to distinguish between domains that want to be flushed by async_synchronize_full() versus those that only expect async_synchronize_{full|cookie}_domain to be used for flushing. [jejb: add async.h to scsi_priv.h for struct async_domain] Signed-off-by: Dan Williams Acked-by: Arjan van de Ven Acked-by: Mark Brown Tested-by: Eldad Zack Signed-off-by: James Bottomley --- kernel/async.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index bd0c168a3bbe..ba5491dfa991 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -62,7 +62,7 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 static LIST_HEAD(async_pending); -static LIST_HEAD(async_running); +static ASYNC_DOMAIN(async_running); static DEFINE_SPINLOCK(async_lock); struct async_entry { @@ -71,7 +71,7 @@ struct async_entry { async_cookie_t cookie; async_func_ptr *func; void *data; - struct list_head *running; + struct async_domain *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); @@ -82,13 +82,12 @@ static atomic_t entry_count; /* * MUST be called with the lock held! */ -static async_cookie_t __lowest_in_progress(struct list_head *running) +static async_cookie_t __lowest_in_progress(struct async_domain *running) { struct async_entry *entry; - if (!list_empty(running)) { - entry = list_first_entry(running, - struct async_entry, list); + if (!list_empty(&running->domain)) { + entry = list_first_entry(&running->domain, typeof(*entry), list); return entry->cookie; } @@ -99,7 +98,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; /* "infinity" value */ } -static async_cookie_t lowest_in_progress(struct list_head *running) +static async_cookie_t lowest_in_progress(struct async_domain *running) { unsigned long flags; async_cookie_t ret; @@ -119,10 +118,11 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; + struct async_domain *running = entry->running; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, entry->running); + list_move_tail(&entry->list, &running->domain); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ @@ -156,7 +156,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) { struct async_entry *entry; unsigned long flags; @@ -223,7 +223,7 @@ EXPORT_SYMBOL_GPL(async_schedule); * Note: This function may be called from atomic or non-atomic contexts. */ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *running) + struct async_domain *running) { return __async_schedule(ptr, data, running); } @@ -238,20 +238,20 @@ void async_synchronize_full(void) { do { async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running) || !list_empty(&async_pending)); + } while (!list_empty(&async_running.domain) || !list_empty(&async_pending)); } EXPORT_SYMBOL_GPL(async_synchronize_full); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @list: running list to synchronize on + * @domain: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list have been done. + * synchronization domain specified by the running list @domain have been done. */ -void async_synchronize_full_domain(struct list_head *list) +void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); @@ -261,11 +261,10 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); * @running: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list submitted + * synchronization domain specified by running list @running submitted * prior to @cookie have been done. */ -void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) { ktime_t uninitialized_var(starttime), delta, endtime; -- cgit v1.2.3 From a4683487f90bfe3049686fc5c566bdc1ad03ace6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 Jul 2012 19:33:30 -0700 Subject: [SCSI] async: make async_synchronize_full() flush all work regardless of domain In response to an async related regression James noted: "My theory is that this is an init problem: The assumption in a lot of our code is that async_synchronize_full() waits for everything ... even the domain specific async schedules, which isn't true." ...so make this assumption true. Each domain, including the default one, registers itself on a global domain list when work is scheduled. Once all entries complete it exits that list. Waiting for the list to be empty syncs all in-flight work across all domains. Domains can opt-out of global syncing if they are declared as exclusive ASYNC_DOMAIN_EXCLUSIVE(). All stack-based domains have been declared exclusive since the domain may go out of scope as soon as the last work item completes. Statically declared domains are mostly ok, but async_unregister_domain() is there to close any theoretical races with pending async_synchronize_full waiters at module removal time. Signed-off-by: Dan Williams Acked-by: Arjan van de Ven Reported-by: Meelis Roos Reported-by: Eldad Zack Tested-by: Eldad Zack Signed-off-by: James Bottomley --- kernel/async.c | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index ba5491dfa991..9d3118384858 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -63,7 +63,9 @@ static async_cookie_t next_cookie = 1; static LIST_HEAD(async_pending); static ASYNC_DOMAIN(async_running); +static LIST_HEAD(async_domains); static DEFINE_SPINLOCK(async_lock); +static DEFINE_MUTEX(async_register_mutex); struct async_entry { struct list_head list; @@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); + if (running->registered && --running->count == 0) + list_del_init(&running->node); /* 4) free the entry */ kfree(entry); @@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a spin_lock_irqsave(&async_lock, flags); newcookie = entry->cookie = next_cookie++; list_add_tail(&entry->list, &async_pending); + if (running->registered && running->count++ == 0) + list_add_tail(&running->node, &async_domains); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -236,12 +242,42 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { + mutex_lock(&async_register_mutex); do { - async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running.domain) || !list_empty(&async_pending)); + struct async_domain *domain = NULL; + + spin_lock_irq(&async_lock); + if (!list_empty(&async_domains)) + domain = list_first_entry(&async_domains, typeof(*domain), node); + spin_unlock_irq(&async_lock); + + async_synchronize_cookie_domain(next_cookie, domain); + } while (!list_empty(&async_domains)); + mutex_unlock(&async_register_mutex); } EXPORT_SYMBOL_GPL(async_synchronize_full); +/** + * async_unregister_domain - ensure no more anonymous waiters on this domain + * @domain: idle domain to flush out of any async_synchronize_full instances + * + * async_synchronize_{cookie|full}_domain() are not flushed since callers + * of these routines should know the lifetime of @domain + * + * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing + */ +void async_unregister_domain(struct async_domain *domain) +{ + mutex_lock(&async_register_mutex); + spin_lock_irq(&async_lock); + WARN_ON(!domain->registered || !list_empty(&domain->node) || + !list_empty(&domain->domain)); + domain->registered = 0; + spin_unlock_irq(&async_lock); + mutex_unlock(&async_register_mutex); +} +EXPORT_SYMBOL_GPL(async_unregister_domain); + /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain * @domain: running list to synchronize on @@ -268,6 +304,9 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; + if (!running) + return; + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); -- cgit v1.2.3 From bc792e612e78a24ae0b30cc5b85f2368379ba4d4 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 20 Jul 2012 17:27:37 -0700 Subject: kdb: Revive dmesg command The kgdb dmesg command is broken after the printk rework. The old logic in kdb code makes no sense in terms of current printk/logging storage format, and KDB simply hangs forever. This patch revives the command by switching to kmsg_dumper iterator. The code is now much more simpler and shorter. Signed-off-by: Anton Vorontsov Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_main.c | 91 ++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 67b847dfa2bb..df17c935d3c6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv) */ static int kdb_dmesg(int argc, const char **argv) { - char *syslog_data[4], *start, *end, c = '\0', *p; - int diag, logging, logsize, lines = 0, adjust = 0, n; + int diag; + int logging; + int lines = 0; + int adjust = 0; + int n = 0; + int skip = 0; + struct kmsg_dumper dumper = { .active = 1 }; + size_t len; + char buf[201]; if (argc > 2) return KDB_ARGCOUNT; @@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] - * logical start, end+1. */ - kdb_syslog_data(syslog_data); - if (syslog_data[2] == syslog_data[3]) - return 0; - logsize = syslog_data[1] - syslog_data[0]; - start = syslog_data[2]; - end = syslog_data[3]; -#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) - for (n = 0, p = start; p < end; ++p) { - c = *KDB_WRAP(p); - if (c == '\n') - ++n; - } - if (c != '\n') - ++n; + kmsg_dump_rewind(&dumper); + while (kmsg_dump_get_line(&dumper, 1, NULL, 0, NULL)) + n++; + if (lines < 0) { if (adjust >= n) kdb_printf("buffer only contains %d lines, nothing " @@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv) else if (adjust - lines >= n) kdb_printf("buffer only contains %d lines, last %d " "lines printed\n", n, n - adjust); - if (adjust) { - for (; start < end && adjust; ++start) { - if (*KDB_WRAP(start) == '\n') - --adjust; - } - if (start < end) - ++start; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - ++lines; - } - end = p; + skip = adjust; + lines = abs(lines); } else if (lines > 0) { - int skip = n - (adjust + lines); + skip = n - lines - adjust; + lines = abs(lines); if (adjust >= n) { kdb_printf("buffer only contains %d lines, " "nothing printed\n", n); @@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv) kdb_printf("buffer only contains %d lines, first " "%d lines printed\n", n, lines); } - for (; start < end && skip; ++start) { - if (*KDB_WRAP(start) == '\n') - --skip; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - --lines; - } - end = p; + } else { + lines = n; } - /* Do a line at a time (max 200 chars) to reduce protocol overhead */ - c = '\n'; - while (start != end) { - char buf[201]; - p = buf; - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - while (start < end && (c = *KDB_WRAP(start)) && - (p - buf) < sizeof(buf)-1) { - ++start; - *p++ = c; - if (c == '\n') - break; + + if (skip >= n || skip < 0) + return 0; + + kmsg_dump_rewind(&dumper); + while (kmsg_dump_get_line(&dumper, 1, buf, sizeof(buf), &len)) { + if (skip) { + skip--; + continue; } - *p = '\0'; - kdb_printf("%s", buf); + if (!lines--) + break; + + kdb_printf("%.*s\n", (int)len - 1, buf); } - if (c != '\n') - kdb_printf("\n"); return 0; } -- cgit v1.2.3 From 1b499d05eecbe04969516717a8e15afb6ad80689 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 20 Jul 2012 17:27:54 -0700 Subject: printk: Remove kdb_syslog_data The function is no longer needed, so remove it. Signed-off-by: Anton Vorontsov Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_private.h | 1 - kernel/printk.c | 15 --------------- 2 files changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 47c4e56e513b..392ec6a25844 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -205,7 +205,6 @@ extern char kdb_grep_string[]; extern int kdb_grep_leading; extern int kdb_grep_trailing; extern char *kdb_cmds[]; -extern void kdb_syslog_data(char *syslog_data[]); extern unsigned long kdb_task_state_string(const char *); extern char kdb_task_state_char (const struct task_struct *); extern unsigned long kdb_task_state(const struct task_struct *p, diff --git a/kernel/printk.c b/kernel/printk.c index 177fa49357a5..c8129678dfbf 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1192,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -#ifdef CONFIG_KGDB_KDB -/* kdb dmesg command needs access to the syslog buffer. do_syslog() - * uses locks so it cannot be used during debugging. Just tell kdb - * where the start and end of the physical and logical logs are. This - * is equivalent to do_syslog(3). - */ -void kdb_syslog_data(char *syslog_data[4]) -{ - syslog_data[0] = log_buf; - syslog_data[1] = log_buf + log_buf_len; - syslog_data[2] = log_buf + log_first_idx; - syslog_data[3] = log_buf + log_next_idx; -} -#endif /* CONFIG_KGDB_KDB */ - static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) -- cgit v1.2.3 From 533827c921c34310f63e859e1d6d0feec439657d Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 20 Jul 2012 17:28:07 -0700 Subject: printk: Implement some unlocked kmsg_dump functions If used from KDB, the locked variants are prone to deadlocks (suppose we got to the debugger w/ the logbuf lock held). So, we have to implement a few routines that grab no logbuf lock. Yet we don't need these functions in modules, so we don't export them. Signed-off-by: Anton Vorontsov Signed-off-by: Linus Torvalds --- kernel/printk.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index c8129678dfbf..ac4bc9e79465 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -2510,7 +2510,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) } /** - * kmsg_dump_get_line - retrieve one kmsg log line + * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) * @dumper: registered kmsg dumper * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to @@ -2525,11 +2525,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) * * A return value of FALSE indicates that there are no more records to * read. + * + * The function is similar to kmsg_dump_get_line(), but grabs no locks. */ -bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) +bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) { - unsigned long flags; struct log *msg; size_t l = 0; bool ret = false; @@ -2537,7 +2538,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, if (!dumper->active) goto out; - raw_spin_lock_irqsave(&logbuf_lock, flags); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; @@ -2545,10 +2545,8 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, } /* last entry */ - if (dumper->cur_seq >= log_next_seq) { - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + if (dumper->cur_seq >= log_next_seq) goto out; - } msg = log_from_idx(dumper->cur_idx); l = msg_print_text(msg, 0, syslog, line, size); @@ -2556,12 +2554,41 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; ret = true; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); out: if (len) *len = l; return ret; } + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + return ret; +} EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** @@ -2663,6 +2690,24 @@ out: } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); +/** + * kmsg_dump_rewind_nolock - reset the interator (unlocked version) + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + * + * The function is similar to kmsg_dump_rewind(), but grabs no locks. + */ +void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +{ + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; +} + /** * kmsg_dump_rewind - reset the interator * @dumper: registered kmsg dumper @@ -2676,10 +2721,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) unsigned long flags; raw_spin_lock_irqsave(&logbuf_lock, flags); - dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + kmsg_dump_rewind_nolock(dumper); raw_spin_unlock_irqrestore(&logbuf_lock, flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); -- cgit v1.2.3 From c064da47144b11be4697a4611f640086a663016a Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 20 Jul 2012 17:28:25 -0700 Subject: kdb: Switch to nolock variants of kmsg_dump functions The locked variants are prone to deadlocks (suppose we got to the debugger w/ the logbuf lock held), so let's switch to nolock variants. Signed-off-by: Anton Vorontsov Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index df17c935d3c6..1f91413edb87 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2072,8 +2072,8 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - kmsg_dump_rewind(&dumper); - while (kmsg_dump_get_line(&dumper, 1, NULL, 0, NULL)) + kmsg_dump_rewind_nolock(&dumper); + while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) n++; if (lines < 0) { @@ -2105,8 +2105,8 @@ static int kdb_dmesg(int argc, const char **argv) if (skip >= n || skip < 0) return 0; - kmsg_dump_rewind(&dumper); - while (kmsg_dump_get_line(&dumper, 1, buf, sizeof(buf), &len)) { + kmsg_dump_rewind_nolock(&dumper); + while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { if (skip) { skip--; continue; -- cgit v1.2.3 From 9a2e03d8ed518a61154f18d83d6466628e519f94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 19 Jul 2012 13:52:53 -0700 Subject: kthread_worker: reorganize to prepare for flush_kthread_work() reimplementation Make the following two non-functional changes. * Separate out insert_kthread_work() from queue_kthread_work(). * Relocate struct kthread_flush_work and kthread_flush_work_fn() definitions above flush_kthread_work(). v2: Added lockdep_assert_held() in insert_kthread_work() as suggested by Andy Walls. Signed-off-by: Tejun Heo Acked-by: Andy Walls --- kernel/kthread.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 3d3de633702e..4bfbff36d447 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -378,6 +378,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* insert @work before @pos in @worker */ +static void insert_kthread_work(struct kthread_worker *worker, + struct kthread_work *work, + struct list_head *pos) +{ + lockdep_assert_held(&worker->lock); + + list_add_tail(&work->node, pos); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); +} + /** * queue_kthread_work - queue a kthread_work * @worker: target kthread_worker @@ -395,10 +408,7 @@ bool queue_kthread_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); if (list_empty(&work->node)) { - list_add_tail(&work->node, &worker->work_list); - work->queue_seq++; - if (likely(worker->task)) - wake_up_process(worker->task); + insert_kthread_work(worker, work, &worker->work_list); ret = true; } spin_unlock_irqrestore(&worker->lock, flags); @@ -406,6 +416,18 @@ bool queue_kthread_work(struct kthread_worker *worker, } EXPORT_SYMBOL_GPL(queue_kthread_work); +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + /** * flush_kthread_work - flush a kthread_work * @work: work to flush @@ -436,18 +458,6 @@ void flush_kthread_work(struct kthread_work *work) } EXPORT_SYMBOL_GPL(flush_kthread_work); -struct kthread_flush_work { - struct kthread_work work; - struct completion done; -}; - -static void kthread_flush_work_fn(struct kthread_work *work) -{ - struct kthread_flush_work *fwork = - container_of(work, struct kthread_flush_work, work); - complete(&fwork->done); -} - /** * flush_kthread_worker - flush all current works on a kthread_worker * @worker: worker to flush -- cgit v1.2.3 From 46f3d976213452350f9d10b0c2780c2681f7075b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 19 Jul 2012 13:52:53 -0700 Subject: kthread_worker: reimplement flush_kthread_work() to allow freeing the work item being executed kthread_worker provides minimalistic workqueue-like interface for users which need a dedicated worker thread (e.g. for realtime priority). It has basic queue, flush_work, flush_worker operations which mostly match the workqueue counterparts; however, due to the way flush_work() is implemented, it has a noticeable difference of not allowing work items to be freed while being executed. While the current users of kthread_worker are okay with the current behavior, the restriction does impede some valid use cases. Also, removing this difference isn't difficult and actually makes the code easier to understand. This patch reimplements flush_kthread_work() such that it uses a flush_work item instead of queue/done sequence numbers. Signed-off-by: Tejun Heo --- kernel/kthread.c | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 4bfbff36d447..b579af57ea10 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -360,16 +360,12 @@ repeat: struct kthread_work, node); list_del_init(&work->node); } + worker->current_work = work; spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); work->func(work); - smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ - work->done_seq = work->queue_seq; - smp_mb(); /* mb worker-b1 paired with flush-b0 */ - if (atomic_read(&work->flushing)) - wake_up_all(&work->done); } else if (!freezing(current)) schedule(); @@ -386,7 +382,7 @@ static void insert_kthread_work(struct kthread_worker *worker, lockdep_assert_held(&worker->lock); list_add_tail(&work->node, pos); - work->queue_seq++; + work->worker = worker; if (likely(worker->task)) wake_up_process(worker->task); } @@ -436,25 +432,35 @@ static void kthread_flush_work_fn(struct kthread_work *work) */ void flush_kthread_work(struct kthread_work *work) { - int seq = work->queue_seq; + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + struct kthread_worker *worker; + bool noop = false; + +retry: + worker = work->worker; + if (!worker) + return; - atomic_inc(&work->flushing); + spin_lock_irq(&worker->lock); + if (work->worker != worker) { + spin_unlock_irq(&worker->lock); + goto retry; + } - /* - * mb flush-b0 paired with worker-b1, to make sure either - * worker sees the above increment or we see done_seq update. - */ - smp_mb__after_atomic_inc(); + if (!list_empty(&work->node)) + insert_kthread_work(worker, &fwork.work, work->node.next); + else if (worker->current_work == work) + insert_kthread_work(worker, &fwork.work, worker->work_list.next); + else + noop = true; - /* A - B <= 0 tests whether B is in front of A regardless of overflow */ - wait_event(work->done, seq - work->done_seq <= 0); - atomic_dec(&work->flushing); + spin_unlock_irq(&worker->lock); - /* - * rmb flush-b1 paired with worker-b0, to make sure our caller - * sees every change made by work->func(). - */ - smp_mb__after_atomic_dec(); + if (!noop) + wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(flush_kthread_work); -- cgit v1.2.3 From 6fec10a1a5866dda3cd6a825a521fc7c2f226ba5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 22 Jul 2012 10:16:34 -0700 Subject: workqueue: fix spurious CPU locality WARN from process_one_work() 25511a4776 "workqueue: reimplement CPU online rebinding to handle idle workers" added CPU locality sanity check in process_one_work(). It triggers if a worker is executing on a different CPU without UNBOUND or REBIND set. This works for all normal workers but rescuers can trigger this spuriously when they're serving the unbound or a disassociated global_cwq - rescuers don't have either flag set and thus its gcwq->cpu can be a different value including %WORK_CPU_UNBOUND. Fix it by additionally testing %GCWQ_DISASSOCIATED. Signed-off-by: Tejun Heo Reported-by: "Paul E. McKenney" LKML-Refence: <20120721213656.GA7783@linux.vnet.ibm.com> --- kernel/workqueue.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 471996a81633..692d97628a10 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1949,7 +1949,13 @@ __acquires(&gcwq->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif + /* + * Ensure we're on the correct CPU. DISASSOCIATED test is + * necessary to avoid spurious warnings from rescuers servicing the + * unbound or a disassociated gcwq. + */ WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + !(gcwq->flags & GCWQ_DISASSOCIATED) && raw_smp_processor_id() != gcwq->cpu); /* -- cgit v1.2.3 From 7266702805f9d824f92ce5c4069eca65d0f21d28 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Jul 2012 14:10:52 +0400 Subject: signal: make sure we don't get stopped with pending task_work Signed-off-by: Al Viro --- kernel/signal.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 677102789cf2..be4f856d52f8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why) void ptrace_notify(int exit_code) { BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); + if (unlikely(current->task_works)) { + if (test_and_clear_ti_thread_flag(current_thread_info(), + TIF_NOTIFY_RESUME)) { + smp_mb__after_clear_bit(); + task_work_run(); + } + } spin_lock_irq(¤t->sighand->siglock); ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); @@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; + if (unlikely(current->task_works)) { + if (test_and_clear_ti_thread_flag(current_thread_info(), + TIF_NOTIFY_RESUME)) { + smp_mb__after_clear_bit(); + task_work_run(); + } + } + if (unlikely(uprobe_deny_signal())) return 0; -- cgit v1.2.3 From 41f9d29f09ca0b22c3631e8a39676e74cda9bcc0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jun 2012 22:10:04 +0400 Subject: trimming task_work: kill ->data get rid of the only user of ->data; this is _not_ the final variant - in the end we'll have task_work and rcu_head identical and just use cred->rcu, at which point the separate allocation will be gone completely. Signed-off-by: Al Viro --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c548232ba39..d1dd54734ce7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -830,7 +830,7 @@ static int irq_thread(void *data) sched_setscheduler(current, SCHED_FIFO, ¶m); - init_task_work(&on_exit_work, irq_thread_dtor, NULL); + init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { -- cgit v1.2.3 From 158e1645e07f3e9f7e4962d7a0997f5c3b98311b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jun 2012 09:24:13 +0400 Subject: trim task_work: get rid of hlist layout based on Oleg's suggestion; single-linked list, task->task_works points to the last element, forward pointer from said last element points to head. I'd still prefer much more regular scheme with two pointers in task_work, but... Signed-off-by: Al Viro --- kernel/fork.c | 2 +- kernel/task_work.c | 64 ++++++++++++++++++++++++++++-------------------------- 2 files changed, 34 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..bebabad59202 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1415,7 +1415,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_HLIST_HEAD(&p->task_works); + p->task_works = NULL; /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/task_work.c b/kernel/task_work.c index 82d1c794066d..9b8948dbdc60 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -19,7 +19,12 @@ task_work_add(struct task_struct *task, struct task_work *twork, bool notify) */ raw_spin_lock_irqsave(&task->pi_lock, flags); if (likely(!(task->flags & PF_EXITING))) { - hlist_add_head(&twork->hlist, &task->task_works); + struct task_work *last = task->task_works; + struct task_work *first = last ? last->next : twork; + twork->next = first; + if (last) + last->next = twork; + task->task_works = twork; err = 0; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -34,51 +39,48 @@ struct task_work * task_work_cancel(struct task_struct *task, task_work_func_t func) { unsigned long flags; - struct task_work *twork; - struct hlist_node *pos; + struct task_work *last, *res = NULL; raw_spin_lock_irqsave(&task->pi_lock, flags); - hlist_for_each_entry(twork, pos, &task->task_works, hlist) { - if (twork->func == func) { - hlist_del(&twork->hlist); - goto found; + last = task->task_works; + if (last) { + struct task_work *q = last, *p = q->next; + while (1) { + if (p->func == func) { + q->next = p->next; + if (p == last) + task->task_works = q == p ? NULL : q; + res = p; + break; + } + if (p == last) + break; + q = p; + p = q->next; } } - twork = NULL; - found: raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - return twork; + return res; } void task_work_run(void) { struct task_struct *task = current; - struct hlist_head task_works; - struct hlist_node *pos; + struct task_work *p, *q; raw_spin_lock_irq(&task->pi_lock); - hlist_move_list(&task->task_works, &task_works); + p = task->task_works; + task->task_works = NULL; raw_spin_unlock_irq(&task->pi_lock); - if (unlikely(hlist_empty(&task_works))) + if (unlikely(!p)) return; - /* - * We use hlist to save the space in task_struct, but we want fifo. - * Find the last entry, the list should be short, then process them - * in reverse order. - */ - for (pos = task_works.first; pos->next; pos = pos->next) - ; - - for (;;) { - struct hlist_node **pprev = pos->pprev; - struct task_work *twork = container_of(pos, struct task_work, - hlist); - twork->func(twork); - if (pprev == &task_works.first) - break; - pos = container_of(pprev, struct hlist_node, next); + q = p->next; /* head */ + p->next = NULL; /* cut it */ + while (q) { + p = q->next; + q->func(q); + q = p; } } -- cgit v1.2.3 From 67d1214551e800f9fe7dc7c47a346d2df0fafed5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jun 2012 11:07:19 +0400 Subject: merge task_work and rcu_head, get rid of separate allocation for keyring case task_work and rcu_head are identical now; merge them (calling the result struct callback_head, rcu_head #define'd to it), kill separate allocation in security/keys since we can just use cred->rcu now. Signed-off-by: Al Viro --- kernel/irq/manage.c | 4 ++-- kernel/task_work.c | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d1dd54734ce7..814c9ef6bba1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } -static void irq_thread_dtor(struct task_work *unused) +static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; @@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused) */ static int irq_thread(void *data) { - struct task_work on_exit_work; + struct callback_head on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; diff --git a/kernel/task_work.c b/kernel/task_work.c index 9b8948dbdc60..76266fb665dc 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -3,7 +3,7 @@ #include int -task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) { unsigned long flags; int err = -ESRCH; @@ -19,8 +19,8 @@ task_work_add(struct task_struct *task, struct task_work *twork, bool notify) */ raw_spin_lock_irqsave(&task->pi_lock, flags); if (likely(!(task->flags & PF_EXITING))) { - struct task_work *last = task->task_works; - struct task_work *first = last ? last->next : twork; + struct callback_head *last = task->task_works; + struct callback_head *first = last ? last->next : twork; twork->next = first; if (last) last->next = twork; @@ -35,16 +35,16 @@ task_work_add(struct task_struct *task, struct task_work *twork, bool notify) return err; } -struct task_work * +struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { unsigned long flags; - struct task_work *last, *res = NULL; + struct callback_head *last, *res = NULL; raw_spin_lock_irqsave(&task->pi_lock, flags); last = task->task_works; if (last) { - struct task_work *q = last, *p = q->next; + struct callback_head *q = last, *p = q->next; while (1) { if (p->func == func) { q->next = p->next; @@ -66,7 +66,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) void task_work_run(void) { struct task_struct *task = current; - struct task_work *p, *q; + struct callback_head *p, *q; raw_spin_lock_irq(&task->pi_lock); p = task->task_works; -- cgit v1.2.3 From ed3e694d78cc75fa79bf29698631b146fd27aa35 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jun 2012 11:31:24 +0400 Subject: move exit_task_work() past exit_files() et.al. ... and get rid of PF_EXITING check in task_work_add(). Signed-off-by: Al Viro --- kernel/exit.c | 6 ++---- kernel/task_work.c | 30 +++++++++++------------------- 2 files changed, 13 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2f59cc334516..d17f6c4ddfa9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -953,14 +953,11 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes, and in - * task_work_add() to avoid the race with exit_task_work(). + * an exiting task cleaning up the robust pi futexes. */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - exit_task_work(tsk); - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), @@ -995,6 +992,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_work(tsk); check_stack_usage(); exit_thread(); diff --git a/kernel/task_work.c b/kernel/task_work.c index 76266fb665dc..fb396089f66a 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -5,34 +5,26 @@ int task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) { + struct callback_head *last, *first; unsigned long flags; - int err = -ESRCH; -#ifndef TIF_NOTIFY_RESUME - if (notify) - return -ENOTSUPP; -#endif /* - * We must not insert the new work if the task has already passed - * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() - * and check PF_EXITING under pi_lock. + * Not inserting the new work if the task has already passed + * exit_task_work() is the responisbility of callers. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - if (likely(!(task->flags & PF_EXITING))) { - struct callback_head *last = task->task_works; - struct callback_head *first = last ? last->next : twork; - twork->next = first; - if (last) - last->next = twork; - task->task_works = twork; - err = 0; - } + last = task->task_works; + first = last ? last->next : twork; + twork->next = first; + if (last) + last->next = twork; + task->task_works = twork; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ - if (likely(!err) && notify) + if (notify) set_notify_resume(task); - return err; + return 0; } struct callback_head * -- cgit v1.2.3 From a2d4c71d1559426155e5da8db3265bfa0d8d398d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jun 2012 11:33:29 +0400 Subject: deal with task_work callbacks adding more work It doesn't matter on normal return to userland path (we'll recheck the NOTIFY_RESUME flag anyway), but in case of exit_task_work() we'll need that as soon as we get callbacks capable of triggering more task_work_add(). Signed-off-by: Al Viro --- kernel/task_work.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index fb396089f66a..91d4e1742a0c 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -60,19 +60,21 @@ void task_work_run(void) struct task_struct *task = current; struct callback_head *p, *q; - raw_spin_lock_irq(&task->pi_lock); - p = task->task_works; - task->task_works = NULL; - raw_spin_unlock_irq(&task->pi_lock); + while (1) { + raw_spin_lock_irq(&task->pi_lock); + p = task->task_works; + task->task_works = NULL; + raw_spin_unlock_irq(&task->pi_lock); - if (unlikely(!p)) - return; + if (unlikely(!p)) + return; - q = p->next; /* head */ - p->next = NULL; /* cut it */ - while (q) { - p = q->next; - q->func(q); - q = p; + q = p->next; /* head */ + p->next = NULL; /* cut it */ + while (q) { + p = q->next; + q->func(q); + q = p; + } } } -- cgit v1.2.3 From d35be8bab9b0ce44bed4b9453f86ebf64062721e Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 19:46:26 +0530 Subject: CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed masks as and when necessary to ensure that the tasks belonging to the cpusets have some place (online CPUs) to run on. And regular CPU hotplug is destructive in the sense that the kernel doesn't remember the original cpuset configurations set by the user, across hotplug operations. However, suspend/resume (which uses CPU hotplug) is a special case in which the kernel has the responsibility to restore the system (during resume), to exactly the same state it was in before suspend. In order to achieve that, do the following: 1. Don't modify cpusets during suspend/resume. At all. In particular, don't move the tasks from one cpuset to another, and don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets during the CPU hotplug operations that are carried out in the suspend/resume path. 2. However, cpusets and sched domains are related. We just want to avoid altering cpusets alone. So, to keep the sched domains updated, build a single sched domain (containing all active cpus) during each of the CPU hotplug operations carried out in s/r path, effectively ignoring the cpusets' cpus_allowed masks. (Since userspace is frozen while doing all this, it will go unnoticed.) 3. During the last CPU online operation during resume, build the sched domains by looking up the (unaltered) cpusets' cpus_allowed masks. That will bring back the system to the same original state as it was in before suspend. Ultimately, this will not only solve the cpuset problem related to suspend resume (ie., restores the cpusets to exactly what it was before suspend, by not touching it at all) but also speeds up suspend/resume because we avoid running cpuset update code for every CPU being offlined/onlined. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 3 +++ kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c8bd652dd12..746d1eeb5dbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2054,6 +2054,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * + * The only exception to this is suspend/resume, where we don't + * modify cpusets at all. + * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_active_mask on each CPU hotplug (cpuhp) event. * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 468bdd44c1ba..4c1d80c6b318 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7097,34 +7097,66 @@ match2: mutex_unlock(&sched_domains_mutex); } +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. */ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + case CPU_ONLINE: case CPU_DOWN_FAILED: cpuset_update_active_cpus(); - return NOTIFY_OK; + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: cpuset_update_active_cpus(); - return NOTIFY_OK; + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } void __init sched_init_smp(void) -- cgit v1.2.3 From 80d1fa6463d934969b7aebf04107fc133463f0f6 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 19:46:41 +0530 Subject: cpusets, hotplug: Implement cpuset tree traversal in a helper function At present, the functions that deal with cpusets during CPU/Mem hotplug are quite messy, since a lot of the functionality is mixed up without clear separation. And this takes a toll on optimization as well. For example, the function cpuset_update_active_cpus() is called on both CPU offline and CPU online events; and it invokes scan_for_empty_cpusets(), which makes sense only for CPU offline events. And hence, the current code ends up unnecessarily traversing the cpuset tree during CPU online also. As a first step towards cleaning up those functions, encapsulate the cpuset tree traversal in a helper function, so as to facilitate upcoming changes. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141635.3692.893.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 746d1eeb5dbe..ba96349aa522 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1989,6 +1989,32 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) move_member_tasks_to_cpuset(cs, parent); } +/* + * Helper function to traverse cpusets. + * It can be used to walk the cpuset tree from top to bottom, completing + * one layer before dropping down to the next (thus always processing a + * node before any of its children). + */ +static struct cpuset *cpuset_next(struct list_head *queue) +{ + struct cpuset *cp; + struct cpuset *child; /* scans child cpusets of cp */ + struct cgroup *cont; + + if (list_empty(queue)) + return NULL; + + cp = list_first_entry(queue, struct cpuset, stack_list); + list_del(queue->next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, queue); + } + + return cp; +} + + /* * Walk the specified cpuset subtree and look for empty cpusets. * The tasks of such cpuset must be moved to a parent cpuset. @@ -2008,19 +2034,11 @@ static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); - } + while ((cp = cpuset_next(&queue)) != NULL) { /* Continue past cpusets with all cpus, mems online */ if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && -- cgit v1.2.3 From 7ddf96b02fe8dd441f452deef879040def5f7b34 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 19:46:55 +0530 Subject: cpusets, hotplug: Restructure functions that are invoked during hotplug Separate out the cpuset related handling for CPU/Memory online/offline. This also helps us exploit the most obvious and basic level of optimization that any notification mechanism (CPU/Mem online/offline) has to offer us: "We *know* why we have been invoked. So stop pretending that we are lost, and do only the necessary amount of processing!". And while at it, rename scan_for_empty_cpusets() to scan_cpusets_upon_hotplug(), which is more appropriate considering how it is restructured. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141650.3692.48637.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 88 +++++++++++++++++++++++++++++++++++++---------------- kernel/sched/core.c | 4 +-- 2 files changed, 63 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba96349aa522..ba0a4d74d262 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -147,6 +147,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +/* the type of hotplug event */ +enum hotplug_event { + CPUSET_CPU_OFFLINE, + CPUSET_MEM_OFFLINE, +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -2016,8 +2022,10 @@ static struct cpuset *cpuset_next(struct list_head *queue) /* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. + * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory + * online/offline) and update the cpusets accordingly. + * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such + * cpuset must be moved to a parent cpuset. * * Called with cgroup_mutex held. We take callback_mutex to modify * cpus_allowed and mems_allowed. @@ -2030,38 +2038,58 @@ static struct cpuset *cpuset_next(struct list_head *queue) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(struct cpuset *root) +static void +scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) { LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ + struct cpuset *cp; /* scans cpusets being updated */ static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while ((cp = cpuset_next(&queue)) != NULL) { + switch (event) { + case CPUSET_CPU_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { + + /* Continue past cpusets with all cpus online */ + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) + continue; + + /* Remove offline cpus from this cpuset. */ + mutex_lock(&callback_mutex); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_active_mask); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpumask_empty(cp->cpus_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_cpumask(cp, NULL); + } + break; - /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; + case CPUSET_MEM_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { - oldmems = cp->mems_allowed; + /* Continue past cpusets with all mems online */ + if (nodes_subset(cp->mems_allowed, + node_states[N_HIGH_MEMORY])) + continue; - /* Remove offline cpus and mems from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - nodes_and(cp->mems_allowed, cp->mems_allowed, + oldmems = cp->mems_allowed; + + /* Remove offline mems from this cpuset. */ + mutex_lock(&callback_mutex); + nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&callback_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + /* Move tasks from the empty cpuset to a parent */ + if (nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_nodemask(cp, &oldmems, NULL); } } } @@ -2080,8 +2108,11 @@ static void scan_for_empty_cpusets(struct cpuset *root) * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). + * + * @cpu_online: Indicates whether this is a CPU online event (true) or + * a CPU offline event (false). */ -void cpuset_update_active_cpus(void) +void cpuset_update_active_cpus(bool cpu_online) { struct sched_domain_attr *attr; cpumask_var_t *doms; @@ -2091,7 +2122,10 @@ void cpuset_update_active_cpus(void) mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); - scan_for_empty_cpusets(&top_cpuset); + + if (!cpu_online) + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); + ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2122,9 +2156,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self, case MEM_OFFLINE: /* * needn't update top_cpuset.mems_allowed explicitly because - * scan_for_empty_cpusets() will update it. + * scan_cpusets_upon_hotplug() will update it. */ - scan_for_empty_cpusets(&top_cpuset); + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); break; default: break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4c1d80c6b318..4b4a63d34396 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7134,7 +7134,7 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, case CPU_ONLINE: case CPU_DOWN_FAILED: - cpuset_update_active_cpus(); + cpuset_update_active_cpus(true); break; default: return NOTIFY_DONE; @@ -7147,7 +7147,7 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, { switch (action) { case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(); + cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: num_cpus_frozen++; -- cgit v1.2.3 From a1cd2b13f754b2c56fb87b8c4912c015f8f57c0c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 19:47:03 +0530 Subject: cpusets: Remove/update outdated comments cpuset_track_online_cpus() is no longer present. So remove the outdated comment and replace it with reference to cpuset_update_active_cpus() which is its equivalent. Also, we don't lack memory hot-unplug anymore. And David Rientjes pointed out how it is dealt with. So update that comment as well. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141700.3692.98192.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba0a4d74d262..f33c7153b6d7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2034,9 +2034,8 @@ static struct cpuset *cpuset_next(struct list_head *queue) * before dropping down to the next. It always processes a node before * any of its children. * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. + * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * if all present pages from a node are offlined. */ static void scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) @@ -2137,7 +2136,7 @@ void cpuset_update_active_cpus(bool cpu_online) /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. - * See also the previous routine cpuset_track_online_cpus(). + * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) -- cgit v1.2.3 From 970e178985cadbca660feb02f4d2ee3a09f7fdda Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 12 Jun 2012 05:18:32 +0200 Subject: sched: Improve scalability via 'CPU buddies', which withstand random perturbations Traversing an entire package is not only expensive, it also leads to tasks bouncing all over a partially idle and possible quite large package. Fix that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try to motivate that one other CPU, if it's busy, tough, it may then try its SMT sibling, but that's all this optimization is allowed to cost. Sibling cache buddies are cross-wired to prevent bouncing. 4 socket 40 core + SMT Westmere box, single 30 sec tbench runs, higher is better: clients 1 2 4 8 16 32 64 128 .......................................................................... pre 30 41 118 645 3769 6214 12233 14312 post 299 603 1211 2418 4697 6847 11606 14557 A nice increase in performance. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339471112.7352.32.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/sched/fair.c | 28 +++++++--------------------- 2 files changed, 45 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b4a63d34396..536b213f0ce5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * + * Iterate domains and sched_groups downward, assigning CPUs to be + * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing + * due to random perturbation self canceling, ie sw buddies pull + * their counterpart to their CPU's hw counterpart. + * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { + struct sched_domain *tmp = sd; + struct sched_group *sg, *prev; + bool right; + + /* + * Traverse to first CPU in group, and count hops + * to cpu from there, switching direction on each + * hop, never ever pointing the last CPU rightward. + */ + do { + id = cpumask_first(sched_domain_span(tmp)); + prev = sg = tmp->groups; + right = 1; + + while (cpumask_first(sched_group_cpus(sg)) != id) + sg = sg->next; + + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { + prev = sg; + sg = sg->next; + right = !right; + } + + /* A CPU went down, never point back to domain start. */ + if (right && cpumask_first(sched_group_cpus(sg->next)) == id) + right = false; + + sg = right ? sg->next : prev; + tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); + } while ((tmp = tmp->child)); + id = cpumask_first(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe3..dd00aaf44fda 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - struct sched_group *sg; - int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, check assigned siblings to find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) - goto next; - } - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); + for_each_lower_domain(sd) { + if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(sd->idle_buddy)) + return sd->idle_buddy; } -done: + return target; } -- cgit v1.2.3 From 85c1e7dae165acd004429f81fe52bfbf55b57a98 Mon Sep 17 00:00:00 2001 From: Prashanth Nageshappa Date: Tue, 19 Jun 2012 17:47:34 +0530 Subject: sched: Reorder 'struct lb_env' members to reduce its size Members of 'struct lb_env' are not in appropriate order to reuse compiler added padding on 64bit architectures. In this patch we reorder those struct members and help reduce the size of the structure from 96 bytes to 80 bytes on 64 bit architectures. Suggested-by: Srivatsa Vaddagiri Signed-off-by: Prashanth Nageshappa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FE06DDE.7000403@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd00aaf44fda..9361669d4242 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3058,8 +3058,8 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; struct lb_env { struct sched_domain *sd; - int src_cpu; struct rq *src_rq; + int src_cpu; int dst_cpu; struct rq *dst_rq; -- cgit v1.2.3 From bbf18b19495942cc730e8ff11fc3ffadf20cbfe1 Mon Sep 17 00:00:00 2001 From: Prashanth Nageshappa Date: Tue, 19 Jun 2012 17:52:07 +0530 Subject: sched: Reset loop counters if all tasks are pinned and we need to redo load balance While load balancing, if all tasks on the source runqueue are pinned, we retry after excluding the corresponding source cpu. However, loop counters env.loop and env.loop_break are not reset before retrying, which can lead to failure in moving the tasks. In this patch we reset env.loop and env.loop_break to their inital values before we retry. Signed-off-by: Prashanth Nageshappa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FE06EEF.2090709@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9361669d4242..f9f9aa0edf3c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4288,8 +4288,11 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; goto redo; + } goto out_balanced; } } -- cgit v1.2.3 From 88b8dac0a14c511ff41486b83a8c3d688936eec0 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 19 Jun 2012 17:43:15 +0530 Subject: sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task Current load balance scheme requires only one cpu in a sched_group (balance_cpu) to look at other peer sched_groups for imbalance and pull tasks towards itself from a busy cpu. Tasks thus pulled by balance_cpu could later get picked up by cpus that are in the same sched_group as that of balance_cpu. This scheme however fails to pull tasks that are not allowed to run on balance_cpu (but are allowed to run on other cpus in its sched_group). That can affect fairness and in some worst case scenarios cause starvation. Consider a two core (2 threads/core) system running tasks as below: Core0 Core1 / \ / \ C0 C1 C2 C3 | | | | v v v v F0 T1 F1 [idle] T2 F0 = SCHED_FIFO task (pinned to C0) F1 = SCHED_FIFO task (pinned to C2) T1 = SCHED_OTHER task (pinned to C1) T2 = SCHED_OTHER task (pinned to C1 and C2) F1 could become a cpu hog, which will starve T2 unless C1 pulls it. Between C0 and C1 however, C0 is required to look for imbalance between cores, which will fail to pull T2 towards Core0. T2 will starve eternally in this case. The same scenario can arise in presence of non-rt tasks as well (say we replace F1 with high irq load). We tackle this problem by having balance_cpu move pinned tasks to one of its sibling cpus (where they can run). We first check if load balance goal can be met by ignoring pinned tasks, failing which we retry move_tasks() with a new env->dst_cpu. This patch modifies load balance semantics on who can move load towards a given cpu in a given sched_domain. Before this patch, a given_cpu or a ilb_cpu acting on behalf of an idle given_cpu is responsible for moving load to given_cpu. With this patch applied, balance_cpu can in addition decide on moving some load to a given_cpu. There is a remote possibility that excess load could get moved as a result of this (balance_cpu and given_cpu/ilb_cpu deciding *independently* and at *same* time to move some load to a given_cpu). However we should see less of such conflicting decisions in practice and moreover subsequent load balance cycles should correct the excess load moved to given_cpu. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Prashanth Nageshappa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FE06CDB.2060605@linux.vnet.ibm.com [ minor edits ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f9f9aa0edf3c..22321db64952 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3054,6 +3054,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; @@ -3064,6 +3065,8 @@ struct lb_env { int dst_cpu; struct rq *dst_rq; + struct cpumask *dst_grpmask; + int new_dst_cpu; enum cpu_idle_type idle; long imbalance; unsigned int flags; @@ -3131,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int new_dst_cpu; + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + return 0; + + new_dst_cpu = cpumask_first_and(env->dst_grpmask, + tsk_cpus_allowed(p)); + if (new_dst_cpu < nr_cpu_ids) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = new_dst_cpu; + } return 0; } + + /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { @@ -4213,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, cur_ld_moved, active_balance = 0; + int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -4223,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); + max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -4253,6 +4281,7 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; + lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -4270,7 +4299,13 @@ more_balance: double_rq_lock(this_rq, busiest); if (!env.loop) update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ + cur_ld_moved = move_tasks(&env); + ld_moved += cur_ld_moved; double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4282,8 +4317,43 @@ more_balance: /* * some other cpu did the load balance for us. */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); + + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && + lb_iterations++ < max_lb_iterations) { + + this_rq = cpu_rq(env.new_dst_cpu); + env.dst_rq = this_rq; + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_SOME_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { -- cgit v1.2.3 From 8323f26ce3425460769605a6aece7a174edaa7d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 13:36:05 +0200 Subject: sched: Fix race in task_group() Stefan reported a crash on a kernel before a3e5d1091c1 ("sched: Don't call task_group() too many times in set_task_rq()"), he found the reason to be that the multiple task_group() invocations in set_task_rq() returned different values. Looking at all that I found a lack of serialization and plain wrong comments. The below tries to fix it using an extra pointer which is updated under the appropriate scheduler locks. Its not pretty, but I can't really see another way given how all the cgroup stuff works. Reported-and-tested-by: Stefan Bader Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340364965.18025.71.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++++- kernel/sched/sched.h | 23 ++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 536b213f0ce5..5d011ef4c0df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). + * see task_group(). * * Furthermore, all task_rq users should acquire both locks, see * task_rq_lock(). @@ -7658,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg) */ void sched_move_task(struct task_struct *tsk) { + struct task_group *tg; int on_rq, running; unsigned long flags; struct rq *rq; @@ -7672,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); + tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) tsk->sched_class->task_move_group(tsk, on_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55844f24435a..c35a1a7dd4d6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. */ static inline struct task_group *task_group(struct task_struct *p) { - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); + return p->sched_task_group; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -- cgit v1.2.3 From 8ded2bbc1845e19c771eb55209aab166ef011243 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 25 Jul 2012 10:40:34 -0400 Subject: posix_types.h: Cleanup stale __NFDBITS and related definitions Recently, glibc made a change to suppress sign-conversion warnings in FD_SET (glibc commit ceb9e56b3d1). This uncovered an issue with the kernel's definition of __NFDBITS if applications #include after including . A build failure would be seen when passing the -Werror=sign-compare and -D_FORTIFY_SOURCE=2 flags to gcc. It was suggested that the kernel should either match the glibc definition of __NFDBITS or remove that entirely. The current in-kernel uses of __NFDBITS can be replaced with BITS_PER_LONG, and there are no uses of the related __FDELT and __FDMASK defines. Given that, we'll continue the cleanup that was started with commit 8b3d1cda4f5f ("posix_types: Remove fd_set macros") and drop the remaining unused macros. Additionally, linux/time.h has similar macros defined that expand to nothing so we'll remove those at the same time. Reported-by: Jeff Law Suggested-by: Linus Torvalds CC: Signed-off-by: Josh Boyer [ .. and fix up whitespace as per akpm ] Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d17f6c4ddfa9..f65345f9e5bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -483,7 +483,7 @@ static void close_files(struct files_struct * files) rcu_read_unlock(); for (;;) { unsigned long set; - i = j * __NFDBITS; + i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; -- cgit v1.2.3