From dbda92d16f8655044e082930e4e9d244b87fde77 Mon Sep 17 00:00:00 2001 From: "Bu, Yitian" Date: Mon, 18 Feb 2013 12:53:37 +0000 Subject: printk: Fix rq->lock vs logbuf_lock unlock lock inversion commit 07354eb1a74d1 ("locking printk: Annotate logbuf_lock as raw") reintroduced a lock inversion problem which was fixed in commit 0b5e1c5255 ("printk: Release console_sem after logbuf_lock"). This probably happened when fixing up patch rejects. Restore the ordering and unlock logbuf_lock before releasing console_sem. Signed-off-by: ybu Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/E807E903FE6CBE4D95E420FBFCC273B827413C@nasanexd01h.na.qualcomm.com Signed-off-by: Thomas Gleixner --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 267ce780abe8..e698e80d8428 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1358,9 +1358,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); if (wake) up(&console_sem); - raw_spin_unlock(&logbuf_lock); return retval; } -- cgit v1.2.3 From 5d33b883aed81c6fbcd09c6f7c3619eee850a7e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:43 +0000 Subject: clocksource: Always verify highres capability If a clocksource has a (wrong) high rating, but can't be used as a timebase for oneshot tick mode, it is unconditionally selected even when the system is already in oneshot tick mode. This causes full system failure. Verify the clocksource selection against the oneshot mode. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.635040849@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c9583382141a..dda5c7130d93 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,6 +553,26 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET +static struct clocksource *clocksource_find_best(bool oneshot) +{ + struct clocksource *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + /** * clocksource_select - Select the best clocksource available * @@ -563,12 +583,14 @@ static u64 clocksource_max_deferment(struct clocksource *cs) */ static void clocksource_select(void) { + bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; - if (!finished_booting || list_empty(&clocksource_list)) + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot); + if (!best) return; - /* First clocksource on the list has the best rating. */ - best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. 
*/ list_for_each_entry(cs, &clocksource_list, list) { if (strcmp(cs->name, override_name) != 0) @@ -578,8 +600,7 @@ static void clocksource_select(void) * capable clocksource if the tick code is in oneshot * mode (highres or nohz) */ - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - tick_oneshot_mode_active()) { + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ printk(KERN_WARNING "Override clocksource %s is not " "HRT compatible. Cannot switch while in " -- cgit v1.2.3 From ba919d1caa2e624eb8c6cae1f2ce0a253e697d45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Let timekeeping_notify return success/error timekeeping_notify() can fail due to a cs->enable() failure, but the caller does not notice and happily keeps the wrong clocksource as the current one. Let the caller know about failure, so the current clocksource will be shown correctly in sysfs. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.696321912@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 6 +++--- kernel/time/timekeeping.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dda5c7130d93..1923a340bd91 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -611,10 +611,10 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) { - printk(KERN_INFO "Switching to clocksource %s\n", best->name); + + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; - timekeeping_notify(curr_clocksource); } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe49..da6e10c7a378 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -648,14 +648,15 @@ static int change_clocksource(void *data) * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ -void timekeeping_notify(struct clocksource *clock) +int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &timekeeper; if (tk->clock == clock) - return; + return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); + return tk->clock == clock ? 0 : -1; } /** -- cgit v1.2.3 From 09ac369c825d9d593404306d59062d854b321e9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Add module refcount Add a module refcount, so the current clocksource cannot be removed unconditionally. 
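A minimal sketch of the rule this introduces (illustrative only, simplified from the change_clocksource() diff below; try_module_get(NULL) succeeds, so built-in clocksources need no special casing):

	if (try_module_get(new->owner)) {
		/* ... switch from old to new ... */
		module_put(old->owner);	/* previous owner may be unloaded now */
	} else {
		/* owning module is on its way out, keep the old clocksource */
	}
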
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.762417789@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index da6e10c7a378..933efa4071c3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -627,11 +627,20 @@ static int change_clocksource(void *data) write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); - if (!new->enable || new->enable(new) == 0) { - old = tk->clock; - tk_setup_internals(tk, new); - if (old->disable) - old->disable(old); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } } timekeeping_update(tk, true, true); -- cgit v1.2.3 From f5a2e34375a5e2b711aea488ac3ae50eeba6d57c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Allow clocksource select to skip current clocksource Preparatory patch for clocksource unbind support. Split out code from clocksource_select and modify it, so it skips the current clocksource on request and tries to find a fallback clocksource. Convert all existing users. No functional change. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.834965397@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1923a340bd91..9782997cb6cf 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,7 +553,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { struct clocksource *cs; @@ -566,6 +566,8 @@ static struct clocksource *clocksource_find_best(bool oneshot) * the best rating. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; return cs; @@ -573,26 +575,20 @@ static struct clocksource *clocksource_find_best(bool oneshot) return NULL; } -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static void __clocksource_select(bool skipcur) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot); + best = clocksource_find_best(oneshot, skipcur); if (!best) return; /* Check for the override clocksource. 
*/ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (strcmp(cs->name, override_name) != 0) continue; /* @@ -618,6 +614,19 @@ static void clocksource_select(void) } } +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + return __clocksource_select(false); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } -- cgit v1.2.3 From 29b5407819f59731c9423238fae03b756822708c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Split out user string input Split out the user string input for clocksource override. Preparatory patch for unbind. [ jstultz: Fix an off by one error ] Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.895851338@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9782997cb6cf..d7f1a45c2fa5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -174,7 +174,8 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +#define CS_NAME_LEN 32 +static char override_name[CS_NAME_LEN]; static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -838,6 +839,23 @@ sysfs_show_current_clocksources(struct device *dev, return count; } +static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused @@ -852,22 +870,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - size_t ret = count; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; + size_t ret; mutex_lock(&clocksource_mutex); - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - clocksource_select(); + ret = clocksource_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); mutex_unlock(&clocksource_mutex); -- cgit v1.2.3 From 7eaeb34305dee26634f7c98ae62646da5cebe91d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Provide unbind interface in sysfs With the module refcount held for the current clocksource there is no way to unload the module. Provide a sysfs interface which allows the clocksource to be unbound. 
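Illustrative usage (the path assumes the standard clocksource sysfs location; "hpet" is just an example name):

	# echo hpet > /sys/devices/system/clocksource/clocksource0/unbind_clocksource
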
One could argue that the clocksource override could be (ab)used to do so, but the clocksource override cannot be used from the kernel itself, while an unbind function can be used to programmatically check whether a clocksource can be shutdown or not. The unbind functionality uses the new skip current feature of clocksource_select and verifies that a fallback clocksource has been installed. If the clocksource which should be unbound is the current clocksource and no fallback can be found, unbind returns -EBUSY. This does not support the unbinding of a clocksource which is used as the watchdog clocksource. No point in fostering crappy hardware. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.964218245@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d7f1a45c2fa5..791d1aeb17ac 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -440,6 +440,11 @@ static int clocksource_watchdog_kthread(void *data) return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -451,6 +456,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -628,6 +634,11 @@ static void clocksource_select(void) return __clocksource_select(false); } +static void clocksource_select_fallback(void) +{ + return __clocksource_select(true); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -803,6 +814,29 @@ void clocksource_change_rating(struct clocksource *cs, int rating) } EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + /* + * I really can't convince myself to support this on hardware + * designed by lobotomized monkeys. + */ + if (clocksource_is_watchdog(cs)) + return -EBUSY; + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + return 0; +} + /** * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered @@ -883,6 +917,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +/** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. 
+ */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + size_t ret; + + ret = clocksource_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? ret : count; +} + /** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused @@ -925,6 +993,8 @@ sysfs_show_available_clocksources(struct device *dev, static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); @@ -948,6 +1018,9 @@ static int __init init_clocksource_sysfs(void) error = device_create_file( &device_clocksource, &dev_attr_current_clocksource); + if (!error) + error = device_create_file(&device_clocksource, + &dev_attr_unbind_clocksource); if (!error) error = device_create_file( &device_clocksource, -- cgit v1.2.3 From a89c7edbe7d7aa80f507915f3dd801211b116b79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Let clocksource_unregister() return success/error The unregister call can fail if the clocksource is the current one and there is no replacement clocksource available. It can also fail if the clocksource is the watchdog clocksource and I'm not going to provide support for this. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.029915527@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 791d1aeb17ac..31b90332f47b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -389,28 +389,17 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - struct clocksource *tmp; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a watched clocksource. */ - list_del_init(&cs->wd_list); - } else if (cs == watchdog) { - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - /* Current watchdog is removed. Find an alternative. */ - watchdog = NULL; - list_for_each_entry(tmp, &clocksource_list, list) { - if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) - continue; - if (!watchdog || tmp->rating > watchdog->rating) - watchdog = tmp; + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); } } - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Check if the watchdog timer needs to be stopped. 
*/ - clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -841,13 +830,15 @@ static int clocksource_unbind(struct clocksource *cs) * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs) { + int ret = 0; + mutex_lock(&clocksource_mutex); - clocksource_dequeue_watchdog(cs); - list_del(&cs->list); - clocksource_select(); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); mutex_unlock(&clocksource_mutex); + return ret; } EXPORT_SYMBOL(clocksource_unregister); -- cgit v1.2.3 From 7172a286ced0c1f4f239a0fa09db54ed37d3ead2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:47 +0000 Subject: clockevents: Get rid of the notifier chain 7+ years and still a single user. Kill it. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.098520211@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 35 +++-------------------------------- kernel/time/tick-broadcast.c | 5 ++--- kernel/time/tick-common.c | 30 +++++------------------------- kernel/time/tick-internal.h | 7 ++++--- 4 files changed, 14 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c6d6400ee137..dd70b4842c62 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "tick-internal.h" @@ -23,10 +22,6 @@ /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); @@ -232,30 +227,6 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - raw_spin_unlock_irqrestore(&clockevents_lock, flags); - - return ret; -} - -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ - raw_notifier_call_chain(&clockevents_chain, reason, dev); -} - /* * Called after a notify add to make devices available which were * released from the notifier call. 
@@ -269,7 +240,7 @@ static void clockevents_notify_released(void) struct clock_event_device, list); list_del(&dev->list); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); } } @@ -290,7 +261,7 @@ void clockevents_register_device(struct clock_event_device *dev) raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); @@ -433,7 +404,7 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); + tick_notify(reason, arg); switch (reason) { case CLOCK_EVT_NOTIFY_CPU_DEAD: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d577669..3500caaa0bfd 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -64,7 +64,7 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ -int tick_check_broadcast_device(struct clock_event_device *dev) +void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; @@ -72,7 +72,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) @@ -90,7 +90,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev) */ if (dev->features & CLOCK_EVT_FEAT_ONESHOT) tick_clock_notify(); - return 1; } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5d3fb100bc06..dbf4e18d5101 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -208,11 +208,11 @@ static void tick_setup_device(struct tick_device *td, /* * Check, if the new registered device should be used. */ -static int tick_check_new_device(struct clock_event_device *newdev) +void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; - int cpu, ret = NOTIFY_OK; + int cpu; unsigned long flags; raw_spin_lock_irqsave(&tick_device_lock, flags); @@ -275,18 +275,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) tick_oneshot_notify(); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; + return; out_bc: /* * Can the new device be used as a broadcast device ? 
*/ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - + tick_install_broadcast_device(newdev); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; } /* @@ -360,17 +356,10 @@ static void tick_resume(void) raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) +void tick_notify(unsigned long reason, void *dev) { switch (reason) { - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_OFF: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: @@ -404,21 +393,12 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, default: break; } - - return NOTIFY_OK; } -static struct notifier_block tick_notifier = { - .notifier_call = tick_notify, -}; - /** * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework */ void __init tick_init(void) { - clockevents_register_notifier(&tick_notifier); tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f0299eae4602..60742fe6f63d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,6 +18,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_notify(unsigned long reason, void *dev); +extern void tick_check_new_device(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); @@ -90,7 +92,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; } */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); @@ -102,9 +104,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { - return 0; } static inline int tick_is_broadcast_device(struct clock_event_device *dev) -- cgit v1.2.3 From 7126cac426137633e470167524e7bcb590fd49b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Simplify locking Now that the notifier chain is gone there are no other users and it's pointless to nest tick_device_lock inside of clockevents_lock because there is no other use case. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.162888472@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index dbf4e18d5101..170a4bdfa99e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -206,16 +205,14 @@ static void tick_setup_device(struct tick_device *td, } /* - * Check, if the new registered device should be used. + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. */ void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; int cpu; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -273,8 +270,6 @@ void tick_check_new_device(struct clock_event_device *newdev) tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); return; out_bc: @@ -282,7 +277,6 @@ out_bc: * Can the new device be used as a broadcast device ? */ tick_install_broadcast_device(newdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } /* @@ -311,9 +305,7 @@ static void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -325,26 +317,20 @@ static void tick_shutdown(unsigned int *cpup) dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; int broadcast = tick_resume_broadcast(); - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -353,9 +339,11 @@ static void tick_resume(void) else tick_resume_oneshot(); } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } +/* + * Called with clockevents_lock held and interrupts disabled + */ void tick_notify(unsigned long reason, void *dev) { switch (reason) { -- cgit v1.2.3 From 8c53daf63f56791ed47fc585206ef3049489612f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Move the tick_notify() switch case to clockevents_notify() No need to call another function and have duplicated cases. 
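Callers are unaffected by the move; the reason codes are simply dispatched in place, e.g. (illustrative):

	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
	/* ... system sleeps ... */
	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
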
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.235746557@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 28 ++++++++++++++++++++++++- kernel/time/tick-common.c | 50 ++++----------------------------------------- kernel/time/tick-internal.h | 5 ++++- 3 files changed, 35 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index dd70b4842c62..0e3a8448e115 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -404,10 +404,36 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_notify(reason, arg); switch (reason) { + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + tick_broadcast_on_off(reason, arg); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(arg); + break; + + case CLOCK_EVT_NOTIFY_SUSPEND: + tick_suspend(); + tick_suspend_broadcast(); + break; + + case CLOCK_EVT_NOTIFY_RESUME: + tick_resume(); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(arg); + tick_shutdown_broadcast(arg); + tick_shutdown(arg); /* * Unregister the clock event devices which were * released from the users in the notify chain. diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 170a4bdfa99e..84c7cfca4d7d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -284,7 +284,7 @@ out_bc: * * Called with interrupts disabled. */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup) { if (*cpup == tick_do_timer_cpu) { int cpu = cpumask_first(cpu_online_mask); @@ -301,7 +301,7 @@ static void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. 
*/ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; @@ -319,14 +319,14 @@ static void tick_shutdown(unsigned int *cpup) } } -static void tick_suspend(void) +void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); clockevents_shutdown(td->evtdev); } -static void tick_resume(void) +void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); int broadcast = tick_resume_broadcast(); @@ -341,48 +341,6 @@ static void tick_resume(void) } } -/* - * Called with clockevents_lock held and interrupts disabled - */ -void tick_notify(unsigned long reason, void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(dev); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - default: - break; - } -} - /** * tick_init - initialize the tick control */ diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 60742fe6f63d..06bfc8802dfb 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,8 +18,11 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); -extern void tick_notify(unsigned long reason, void *dev); extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); extern void clockevents_shutdown(struct clock_event_device *dev); -- cgit v1.2.3 From ccf33d6880f39a35158fff66db13000ae4943fac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Add module refcount We want to be able to remove clockevent modules as well. Add a refcount so we don't remove a module with an active clock event device. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.307435149@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 1 + kernel/time/tick-broadcast.c | 3 +++ kernel/time/tick-common.c | 4 ++++ 3 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0e3a8448e115..89e394caa769 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -357,6 +357,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. 
*/ if (old) { + module_put(old->owner); clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3500caaa0bfd..0e374cd2e0ef 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -73,6 +74,8 @@ void tick_install_broadcast_device(struct clock_event_device *dev) tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return; + if (!try_module_get(dev->owner)) + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 84c7cfca4d7d..433a1e11d13b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -257,6 +258,9 @@ void tick_check_new_device(struct clock_event_device *newdev) goto out_bc; } + if (!try_module_get(newdev->owner)) + return; + /* * Replace the eventually existing device by the new * device. If the current device is the broadcast device, do -- cgit v1.2.3 From 501f867064e95f9a6f540e60705be0937280e7ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Provide sysfs interface Provide a simple sysfs interface for the clockevent devices. Show the current active clockevent device. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.371634778@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 89e394caa769..0a23f4f29934 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -460,4 +461,89 @@ void clockevents_notify(unsigned long reason, void *arg) raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? 
tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ -- cgit v1.2.3 From 45cb8e01b2ecef1c2afb18333e95793fa1a90281 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Split out selection logic Split out the clockevent device selection logic. Preparatory patch to allow unbinding active clockevent devices. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.431796247@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 25 ++++++++++++---- kernel/time/tick-common.c | 69 +++++++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0e374cd2e0ef..d067c01586f5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -65,19 +65,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; - if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || - (tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if (!tick_check_broadcast_device(cur, dev)) return; + if (!try_module_get(dev->owner)) return; - clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + clockevents_exchange_device(cur, dev); if (cur) cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 433a1e11d13b..c34021650348 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,37 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +static bool 
tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* Use the higher rated one */ + return !curdev || newdev->rating > curdev->rating; +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. @@ -223,40 +254,12 @@ void tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. - */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; - - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) - goto out_bc; - } + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! - */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; if (!try_module_get(newdev->owner)) return; -- cgit v1.2.3 From 03e13cf5ee60584fe0c831682c67212effb7fca4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Implement unbind functionality Provide a sysfs interface to allow unbinding of clockevent devices. The device is unbound if it is unused or if there is a replacement device available. Unbinding of broadcast devices is not supported as we don't want to foster that nonsense. If no replacement device is available the unbind returns -EBUSY. Unbind is available from the kernel and through sysfs, which is necessary to drop the module refcount. 
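Illustrative usage (the per-cpu device path follows from the sysfs registration in this patch; the echoed name is whatever the to-be-removed device is called):

	# echo <device_name> > /sys/devices/system/clockevents/clockevent0/unbind_device
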
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.499216659@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 125 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/clocksource.c | 9 ++-- kernel/time/tick-common.c | 24 +++++++++ kernel/time/tick-internal.h | 7 +++ 4 files changed, 161 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0a23f4f29934..38959c866789 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -25,6 +25,13 @@ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -245,6 +252,90 @@ static void clockevents_notify_released(void) } } +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. 
+ */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind); + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -487,6 +578,38 @@ static ssize_t sysfs_show_current_tick_dev(struct device *dev, } static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + size_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", @@ -529,6 +652,8 @@ static int __init tick_init_sysfs(void) err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 31b90332f47b..6d05b00410cc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,8 @@ #include #include +#include "tick-internal.h" + void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, u64 start_tstamp) @@ -174,7 +176,6 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -#define CS_NAME_LEN 32 static char override_name[CS_NAME_LEN]; static int finished_booting; @@ -864,7 +865,7 @@ sysfs_show_current_clocksources(struct device *dev, return count; } -static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) { size_t ret = cnt; @@ -899,7 +900,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, mutex_lock(&clocksource_mutex); - ret = clocksource_get_uname(buf, override_name, count); + ret = sysfs_get_uname(buf, override_name, count); if (ret >= 0) clocksource_select(); @@ -925,7 +926,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, char name[CS_NAME_LEN]; size_t ret; - ret = clocksource_get_uname(buf, name, count); + ret = sysfs_get_uname(buf, name, count); if (ret < 0) return ret; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c34021650348..5edfb4806032 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,17 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = 
&__get_cpu_var(tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + static bool tick_check_percpu(struct clock_event_device *curdev, struct clock_event_device *newdev, int cpu) { @@ -236,6 +247,19 @@ static bool tick_check_preferred(struct clock_event_device *curdev, return !curdev || newdev->rating > curdev->rating; } +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 06bfc8802dfb..be1690eaecff 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -11,6 +11,8 @@ extern seqlock_t jiffies_lock; #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 +#define CS_NAME_LEN 32 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; @@ -23,9 +25,14 @@ extern void tick_handover_do_timer(int *cpup); extern void tick_shutdown(unsigned int *cpup); extern void tick_suspend(void); extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); +extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + /* * NO_HZ / high resolution timer shared code */ -- cgit v1.2.3 From c7e99fc75de8882bc4104455ace366d9d3599a96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:28:02 +0200 Subject: clockevents: Define CS_NAME_LEN unconditionally Unbreak architectures which do not use clockevents but need to build some of the core timekeeping infrastructure. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/tick-internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index be1690eaecff..bc906cad709b 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,13 +6,13 @@ extern seqlock_t jiffies_lock; +#define CS_NAME_LEN 32 + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 -#define CS_NAME_LEN 32 - DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; -- cgit v1.2.3 From 1eaff67266b6b6c97bbd33cf2c20577822836413 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:48:46 +0200 Subject: clocksource: Implement clocksource_select_fallback() for CONFIG_ARCH_USES_GETTIMEOFFSET=y commit 7eaeb34305 (clocksource: Provide unbind interface in sysfs) implemented clocksource_select_fallback() which is not defined for CONFIG_ARCH_USES_GETTIMEOFFSET=y. Add an empty inline function for that. 
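The fix is the usual empty-stub pattern (sketch of the resulting code, matching the one-line diff below):

	#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */

	static inline void clocksource_select(void) { }
	static inline void clocksource_select_fallback(void) { }

	#endif
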
Reported-by: Ingo Molnar Reported-by: fengguang.wu@intel.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6d05b00410cc..e713ef7d19a7 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -632,6 +632,7 @@ static void clocksource_select_fallback(void) #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } #endif -- cgit v1.2.3 From 6cffe00f7d4e24679eae6b7aae4caaf915288256 Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Wed, 15 May 2013 14:38:11 -0700 Subject: alarmtimer: Add functions for timerfd support Add functions needed for hooking up alarmtimer to timerfd: * alarm_restart: Similar to hrtimer_restart, restart an alarmtimer after the expires time has already been updated (as with alarm_forward). * alarm_forward_now: Similar to hrtimer_forward_now, move the expires time forward to an interval from the current time of the associated clock. * alarm_start_relative: Start an alarmtimer with an expires time relative to the current time of the associated clock. * alarm_expires_remaining: Similar to hrtimer_expires_remaining, return the amount of time remaining until alarm expiry. Signed-off-by: Todd Poynor Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f11d83b12949..3e5cba274475 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -199,6 +199,12 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} + #ifdef CONFIG_RTC_CLASS /** * alarmtimer_suspend - Suspend time callback @@ -305,7 +311,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, } /** - * alarm_start - Sets an alarm to fire + * alarm_start - Sets an absolute alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm */ @@ -324,6 +330,31 @@ int alarm_start(struct alarm *alarm, ktime_t start) return ret; } +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add(start, base->gettime()); + return alarm_start(alarm, start); +} + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} + /** * alarm_try_to_cancel - Tries to cancel an alarm timer * @alarm: ptr to alarm to be canceled @@ -394,6 +425,12 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) return overrun; } +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + return alarm_forward(alarm, base->gettime(), interval); +} -- cgit v1.2.3 From 5c83545f24ab3dd67e0ae0e2b795fea750f08c34 Mon 
Sep 17 00:00:00 2001 From: Colin Cross Date: Tue, 21 May 2013 22:32:14 -0700 Subject: power: Add option to log time spent in suspend Below is a patch from android kernel that maintains a histogram of suspend times. Please review and provide feedback. Statistics on the time spent in suspend are kept in /sys/kernel/debug/sleep_time. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Todd Poynor [zoran.markovic@linaro.org: Re-formatted suspend time table to better fit expected values. Moved accounting of suspend time into timekeeping core. Removed CONFIG_SUSPEND_TIME flag and made the feature conditional on CONFIG_DEBUG_FS. Changed the file name to sleep_time to better fit terminology in timekeeping core. Changed seq_printf to seq_puts. Tweaked commit message] Signed-off-by: Zoran Markovic Signed-off-by: John Stultz --- kernel/time/Makefile | 1 + kernel/time/timekeeping.c | 2 ++ kernel/time/timekeeping_debug.c | 72 ++++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping_internal.h | 14 ++++++++ 4 files changed, 89 insertions(+) create mode 100644 kernel/time/timekeeping_debug.c create mode 100644 kernel/time/timekeeping_internal.h (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2ab504..d52ac8bf0006 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 933efa4071c3..838fc0777b68 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,7 @@ #include "tick-internal.h" #include "ntp_internal.h" +#include "timekeeping_internal.h" static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -851,6 +852,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_debug_account_sleep_time(delta); } /** diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000000000000..802433a4f5eb --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,72 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details.
+ */ + +#include +#include +#include +#include +#include +#include + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000000000000..13323ea08ffa --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.2.3 From 5e1cda5b8ae93f5f02e8c5a30390ac9b4d2c20e6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 29 May 2013 03:10:53 +0100 Subject: irqdomain: Relax failure path on setting up mappings Commit 98aa468e, "irqdomain: Support for static IRQ mapping and association" introduced an API for directly associating blocks of hwirqs to linux irqs. However, if any irq in that block failed to map (say if the mapping functions returns an error because the irq is already mapped) then the whole thing will fail and roll back. This is probably too aggressive since there are valid reasons why a mapping may fail. ie. Firmware may have a particular IRQ marked as unusable. This patch drops the error path out of irq_domain_associate(). If a mapping fails, then it is simply skipped. There is no reason to fail the entire allocation. v2: Still output an information message on failed mappings and make sure attempted mapping gets cleared out of the irq_data structure. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner --- kernel/irq/irqdomain.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 20b677dd0b27..61d6d3c80fee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -464,23 +464,15 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, /* * If map() returns -EPERM, this interrupt is protected * by the firmware or some other service and shall not - * be mapped. - * - * Since on some platforms we blindly try to map everything - * we end up with a log full of backtraces. 
- * - * So instead, we silently fail on -EPERM, it is the - * responsibility of the PIC driver to display a relevant - * message if needed. + * be mapped. Don't bother telling the user about it. */ if (ret != -EPERM) { - pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", - virq, hwirq, ret); - WARN_ON(1); + pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", + of_node_full_name(domain->of_node), hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; - goto err_unmap; + continue; } } -- cgit v1.2.3 From 9bbf877d3b6b8c5991000296f40a3f0fe66fa89b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 12:10:24 +0100 Subject: irqdomain: Replace LEGACY mapping with LINEAR The LEGACY mapping unnecessarily complicates the irqdomain code and can easily be implemented with a linear mapping. By ripping it out and replacing it with the LINEAR mapping the object size of irqdomain.c shrinks by about 330 bytes (ARMv7) which offsets the additional allocation required by the linear map. It also makes it possible for current LEGACY map users to pre-allocate irq_descs for a subset of the hwirqs and dynamically allocate the rest as needed. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Rob Herring --- kernel/irq/irqdomain.c | 84 ++++---------------------------------------------- 1 file changed, 6 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 61d6d3c80fee..1ac8cf41b9a5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -82,13 +82,6 @@ void irq_domain_remove(struct irq_domain *domain) mutex_lock(&irq_domain_mutex); switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* - * Legacy domains don't manage their own irq_desc - * allocations, we expect the caller to handle irq_desc - * freeing on their own. - */ - break; case IRQ_DOMAIN_MAP_TREE: /* * radix_tree_delete() takes care of destroying the root @@ -122,17 +115,6 @@ void irq_domain_remove(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_domain_remove); -static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; - int size = domain->revmap_data.legacy.size; - - if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) - return 0; - return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; -} - /** * irq_domain_add_simple() - Allocate and register a simple irq_domain. * @of_node: pointer to interrupt controller's device tree node. 
@@ -213,57 +195,17 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int i; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + pr_debug("Setting up legacy domain virq[%i:%i] ==> hwirq[%i:%i]\n", + first_irq, first_irq + size - 1, + (int)first_hwirq, (int)first_hwirq + size -1); + + domain = irq_domain_add_linear(of_node, first_hwirq + size, ops, host_data); if (!domain) return NULL; - domain->revmap_data.legacy.first_irq = first_irq; - domain->revmap_data.legacy.first_hwirq = first_hwirq; - domain->revmap_data.legacy.size = size; - - mutex_lock(&irq_domain_mutex); - /* Verify that all the irqs are available */ - for (i = 0; i < size; i++) { - int irq = first_irq + i; - struct irq_data *irq_data = irq_get_irq_data(irq); - - if (WARN_ON(!irq_data || irq_data->domain)) { - mutex_unlock(&irq_domain_mutex); - irq_domain_free(domain); - return NULL; - } - } + WARN_ON(irq_domain_associate_many(domain, first_irq, first_hwirq, size)); - /* Claim all of the irqs before registering a legacy domain */ - for (i = 0; i < size; i++) { - struct irq_data *irq_data = irq_get_irq_data(first_irq + i); - irq_data->hwirq = first_hwirq + i; - irq_data->domain = domain; - } - mutex_unlock(&irq_domain_mutex); - - for (i = 0; i < size; i++) { - int irq = first_irq + i; - int hwirq = first_hwirq + i; - - /* IRQ0 gets ignored */ - if (!irq) - continue; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - if (ops->map) - ops->map(domain, irq, hwirq); - - /* Clear norequest flags */ - irq_clear_status_flags(irq, IRQ_NOREQUEST); - } - - irq_domain_add(domain); return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); @@ -492,10 +434,6 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, } return 0; - - err_unmap: - irq_domain_disassociate_many(domain, irq_base, i); - return -EINVAL; } EXPORT_SYMBOL_GPL(irq_domain_associate_many); @@ -575,10 +513,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } - /* Get a virtual interrupt number */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return irq_domain_legacy_revmap(domain, hwirq); - /* Allocate a virtual interrupt number */ hint = hwirq % nr_irqs; if (hint == 0) @@ -706,10 +640,6 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - /* Never unmap legacy interrupts */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return; - irq_domain_disassociate_many(domain, virq, 1); irq_free_desc(virq); } @@ -732,8 +662,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return 0; switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - return irq_domain_legacy_revmap(domain, hwirq); case IRQ_DOMAIN_MAP_LINEAR: return irq_linear_revmap(domain, hwirq); case IRQ_DOMAIN_MAP_TREE: -- cgit v1.2.3 From 0bb4afb45dd1add73ca643a865daa38716aeff0c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 14:23:30 +0100 Subject: irqdomain: Add a name field This patch adds a name field to the irq_domain structure to help mere mortals understand the mappings between irq domains and virqs. It also converts a number of places that have open-coded some kind of fudging an irqdomain name to use the new field. This means a more consistent display of names in irq domain log messages and debugfs output. 
Signed-off-by: Grant Likely --- kernel/irq/generic-chip.c | 1 + kernel/irq/irqdomain.c | 19 ++++++------------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 95575d8d5392..ca98cc5d6308 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -305,6 +305,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } + d->name = name; return 0; } EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1ac8cf41b9a5..b1b5e6793fd2 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -410,12 +410,15 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, */ if (ret != -EPERM) { pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", - of_node_full_name(domain->of_node), hwirq, virq, ret); + domain->name, hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; continue; } + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && irq_data->chip) + domain->name = irq_data->chip->name; } switch (domain->revmap_type) { @@ -708,8 +711,6 @@ static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - const char *p; - static const char none[] = "none"; void *data; int i; @@ -731,20 +732,12 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); chip = irq_desc_get_chip(desc); - if (chip && chip->name) - p = chip->name; - else - p = none; - seq_printf(m, "%-15s ", p); + seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain) - p = of_node_full_name(desc->irq_data.domain->of_node); - else - p = none; - seq_printf(m, "%s\n", p); + seq_printf(m, "%s\n", desc->irq_data.domain->name); } raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From cef5075c8c238ffd04c86a77a5a9bdbd18031137 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 11 Jul 2012 17:24:31 +0100 Subject: irqdomain: merge linear and tree reverse mappings. Keeping them separate makes irq_domain more complex and adds a lot of code (as proven by the diffstat). Merging them simplifies the whole scheme. This change makes it so both the tree and linear methods can be used by the same irq_domain instance. If the hwirq is less than the ->linear_size, then the linear map is used to reverse map the hwirq. Otherwise the radix tree is used. The test for which map to use is no more expensive than the existing code, so the performance of the fast path is preserved. It also means that complex interrupt controllers can use both the linear map and a tree in the same domain. This may be useful for an interrupt controller with a base set of core irqs and a large number of GPIOs which might be used as irqs. The linear map could cover the core irqs, and the tree used for the gpios.
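Condensed, the reverse lookup this commit arrives at behaves roughly like the following sketch (illustrative only, using the field names from the patch; the real code is the irq_linear_revmap() hunk further below):

        /* Sketch: hwirqs below the linear size resolve through the flat
         * array; anything larger falls back to the radix tree. */
        static unsigned int revmap_lookup(struct irq_domain *d,
                                          irq_hw_number_t hwirq)
        {
                struct irq_data *data;

                if (hwirq < d->revmap_data.linear.size)
                        return d->linear_revmap[hwirq];

                rcu_read_lock();
                data = radix_tree_lookup(&d->revmap_data.tree, hwirq);
                rcu_read_unlock();
                return data ? data->irq : 0;
        }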
v2: Drop reorganization of revmap data Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Rob Herring --- kernel/irq/irqdomain.c | 107 ++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b1b5e6793fd2..5a1d8ec8509e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -34,22 +34,24 @@ static struct irq_domain *irq_default_domain; * to IRQ domain, or NULL on failure. */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, + unsigned int revmap_type, int size, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; - domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, - of_node_to_nid(of_node)); + domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), + GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; /* Fill structure */ + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); domain->revmap_type = revmap_type; domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); + domain->revmap_data.linear.size = size; return domain; } @@ -81,22 +83,12 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_TREE: - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_data.tree.height); - break; - case IRQ_DOMAIN_MAP_LINEAR: - kfree(domain->revmap_data.linear.revmap); - domain->revmap_data.linear.size = 0; - break; - case IRQ_DOMAIN_MAP_NOMAP: - break; - } + /* + * radix_tree_delete() takes care of destroying the root + * node when all entries are removed. Shout if there are + * any mappings left. + */ + WARN_ON(domain->revmap_data.tree.height); list_del(&domain->link); @@ -223,20 +215,11 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int *revmap; - revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, - of_node_to_nid(of_node)); - if (WARN_ON(!revmap)) + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, size, ops, host_data); + if (!domain) return NULL; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); - if (!domain) { - kfree(revmap); - return NULL; - } - domain->revmap_data.linear.size = size; - domain->revmap_data.linear.revmap = revmap; irq_domain_add(domain); return domain; } @@ -248,7 +231,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + IRQ_DOMAIN_MAP_NOMAP, 0, ops, host_data); if (domain) { domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); @@ -257,28 +240,6 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, } EXPORT_SYMBOL_GPL(irq_domain_add_nomap); -/** - * irq_domain_add_tree() - * @of_node: pointer to interrupt controller's device tree node. - * @ops: map/unmap domain callbacks - * - * Note: The radix tree will be allocated later during boot automatically - * (the reverse mapping will use the slow path until that happens). 
- */ -struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_TREE, ops, host_data); - if (domain) { - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_tree); - /** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller @@ -359,17 +320,13 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, irq_data->domain = NULL; irq_data->hwirq = 0; - /* Clear reverse map */ - switch(domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = 0; - break; - case IRQ_DOMAIN_MAP_TREE: + /* Clear reverse map for this hwirq */ + if (hwirq < domain->revmap_data.linear.size) { + domain->linear_revmap[hwirq] = 0; + } else { mutex_lock(&revmap_trees_mutex); radix_tree_delete(&domain->revmap_data.tree, hwirq); mutex_unlock(&revmap_trees_mutex); - break; } } } @@ -421,16 +378,12 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, domain->name = irq_data->chip->name; } - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = virq; - break; - case IRQ_DOMAIN_MAP_TREE: + if (hwirq < domain->revmap_data.linear.size) { + domain->linear_revmap[hwirq] = virq; + } else { mutex_lock(&revmap_trees_mutex); radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); - break; } irq_clear_status_flags(virq, IRQ_NOREQUEST); @@ -667,13 +620,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, switch (domain->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_TREE: - rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); - rcu_read_unlock(); - if (data) - return data->irq; - break; case IRQ_DOMAIN_MAP_NOMAP: data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) @@ -696,13 +642,18 @@ EXPORT_SYMBOL_GPL(irq_find_mapping); unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { + struct irq_data *data; BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); /* Check revmap bounds; complain if exceeded */ - if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) - return 0; + if (hwirq >= domain->revmap_data.linear.size) { + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + rcu_read_unlock(); + return data ? data->irq : 0; + } - return domain->revmap_data.linear.revmap[hwirq]; + return domain->linear_revmap[hwirq]; } EXPORT_SYMBOL_GPL(irq_linear_revmap); -- cgit v1.2.3 From 1aa0dd94ca07df818cf14588c9031ab1d7fd84d3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jun 2013 12:03:59 +0100 Subject: irqdomain: Eliminate revmap type The NOMAP irq_domain type is only used by a handful of interrupt controllers and it unnecessarily complicates the code by adding special cases on how to look up mappings and different revmap functions are used for each type which need to validate the correct type is passed to it before performing the reverse map. Eliminating the revmap_type and making a single reverse mapping function simplifies the code. 
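The unified lookup can be pictured as a single path (a sketch of the resulting logic, using the fields this series introduces; the real function is the irq_find_mapping() hunk below):

        /* Sketch: a direct-mapped interrupt satisfies hwirq == virq, so probe
         * that identity first; everything else goes through the linear/radix
         * reverse map. */
        static unsigned int find_mapping(struct irq_domain *d,
                                         irq_hw_number_t hwirq)
        {
                struct irq_data *data;

                if (hwirq < d->revmap_direct_max_irq) {
                        data = irq_get_irq_data(hwirq);
                        if (data && data->domain == d && data->hwirq == hwirq)
                                return hwirq;
                }
                return irq_linear_revmap(d, hwirq);
        }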
It also shouldn't be any slower than having separate revmap functions because the type of the revmap needed to be checked anyway. The linear and tree revmap types were already merged in a previous patch. This patch rolls the NOMAP or direct mapping behaviour into the same domain code making it possible for an irq domain to do any mapping type; linear, tree or direct; and that the mapping will be transparent to the interrupt controller driver. With this change, direct mappings will get stored in the linear or tree mapping for consistency. Reverse mapping from the hwirq to virq will go through the normal lookup process. However, any controller using a direct mapping can take advantage of knowing that hwirq==virq for any mapped interrupts and skip doing a revmap lookup when handling IRQs. Signed-off-by: Grant Likely --- kernel/irq/generic-chip.c | 5 +---- kernel/irq/irqdomain.c | 55 +++++++++++++++++++---------------------- 2 files changed, 23 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ca98cc5d6308..4b011064e146 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -270,10 +270,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, if (d->gc) return -EBUSY; - if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) - return -EINVAL; - - numchips = d->revmap_data.linear.size / irqs_per_chip; + numchips = d->revmap_size / irqs_per_chip; if (!numchips) return -EINVAL; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a1d8ec8509e..c38be78fceb4 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -25,7 +25,6 @@ static struct irq_domain *irq_default_domain; /** * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller - * @revmap_type: type of reverse mapping to use * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -34,7 +33,7 @@ static struct irq_domain *irq_default_domain; * to IRQ domain, or NULL on failure. */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, int size, + int size, const struct irq_domain_ops *ops, void *host_data) { @@ -46,12 +45,11 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, return NULL; /* Fill structure */ - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - domain->revmap_type = revmap_type; + INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); - domain->revmap_data.linear.size = size; + domain->revmap_size = size; return domain; } @@ -67,8 +65,7 @@ static void irq_domain_add(struct irq_domain *domain) mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - pr_debug("Allocated domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Added domain %s\n", domain->name); } /** @@ -88,7 +85,7 @@ void irq_domain_remove(struct irq_domain *domain) * node when all entries are removed. Shout if there are * any mappings left.
*/ - WARN_ON(domain->revmap_data.tree.height); + WARN_ON(domain->revmap_tree.height); list_del(&domain->link); @@ -100,8 +97,7 @@ void irq_domain_remove(struct irq_domain *domain) mutex_unlock(&irq_domain_mutex); - pr_debug("Removed domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Removed domain %s\n", domain->name); irq_domain_free(domain); } @@ -216,7 +212,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, { struct irq_domain *domain; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, size, ops, host_data); + domain = irq_domain_alloc(of_node, size, ops, host_data); if (!domain) return NULL; @@ -230,10 +226,9 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, 0, ops, host_data); + struct irq_domain *domain = irq_domain_alloc(of_node, 0, ops, host_data); if (domain) { - domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; + domain->revmap_direct_max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); } return domain; @@ -321,11 +316,11 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, irq_data->hwirq = 0; /* Clear reverse map for this hwirq */ - if (hwirq < domain->revmap_data.linear.size) { + if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_data.tree, hwirq); + radix_tree_delete(&domain->revmap_tree, hwirq); mutex_unlock(&revmap_trees_mutex); } } @@ -378,11 +373,11 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, domain->name = irq_data->chip->name; } - if (hwirq < domain->revmap_data.linear.size) { + if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = virq; } else { mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); + radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); } @@ -399,7 +394,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use - * the linux irq as the hardware interrupt number. + * the linux irq as the hardware interrupt number. It still uses the linear + * or radix tree to store the mapping, but the irq controller can optimize + * the revmap path by using the hwirq directly. 
*/ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { @@ -408,17 +405,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) if (domain == NULL) domain = irq_default_domain; - if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) - return 0; - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; } - if (virq >= domain->revmap_data.nomap.max_irq) { + if (virq >= domain->revmap_direct_max_irq) { pr_err("ERROR: no free irqs available below %i maximum\n", - domain->revmap_data.nomap.max_irq); + domain->revmap_direct_max_irq); irq_free_desc(virq); return 0; } @@ -617,17 +611,13 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (domain == NULL) return 0; - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_NOMAP: + if (hwirq < domain->revmap_direct_max_irq) { data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) return hwirq; - break; } - return 0; + return irq_linear_revmap(domain, hwirq); } EXPORT_SYMBOL_GPL(irq_find_mapping); @@ -643,12 +633,11 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { struct irq_data *data; - BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); /* Check revmap bounds; complain if exceeded */ - if (hwirq >= domain->revmap_data.linear.size) { + if (hwirq >= domain->revmap_size) { rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + data = radix_tree_lookup(&domain->revmap_tree, hwirq); rcu_read_unlock(); return data ? data->irq : 0; } -- cgit v1.2.3 From fa40f377577752b83252b9d2b3165d4acee0eb7c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jun 2013 12:57:40 +0100 Subject: irqdomain: Clean up aftermath of irq_domain refactoring After refactoring the irqdomain code, there are a number of API functions that are merely empty wrappers around core code. Drop those wrappers out of the C file and replace them with static inlines in the header. Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 127 ++++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c38be78fceb4..e0db59e2eef6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; /** - * irq_domain_alloc() - Allocate a new irq_domain data structure + * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller + * @size: Size of linear map; 0 for radix mapping only + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -32,10 +35,10 @@ static struct irq_domain *irq_default_domain; * register allocated irq_domain with irq_domain_register(). Returns pointer * to IRQ domain, or NULL on failure. 
*/ -static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - int size, - const struct irq_domain_ops *ops, - void *host_data) +struct irq_domain *__irq_domain_add(struct device_node *of_node, + int size, int direct_max, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; @@ -50,23 +53,16 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, domain->host_data = host_data; domain->of_node = of_node_get(of_node); domain->revmap_size = size; + domain->revmap_direct_max_irq = direct_max; - return domain; -} - -static void irq_domain_free(struct irq_domain *domain) -{ - of_node_put(domain->of_node); - kfree(domain); -} - -static void irq_domain_add(struct irq_domain *domain) -{ mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); + pr_debug("Added domain %s\n", domain->name); + return domain; } +EXPORT_SYMBOL_GPL(__irq_domain_add); /** * irq_domain_remove() - Remove an irq domain. @@ -99,30 +95,28 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - irq_domain_free(domain); + of_node_put(domain->of_node); + kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); /** - * irq_domain_add_simple() - Allocate and register a simple irq_domain. + * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, - * pass zero to assign irqs on-the-fly. This will result in a - * linear IRQ domain so it is important to use irq_create_mapping() - * for each used IRQ, especially when SPARSE_IRQ is enabled. + * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then + * pre-map all of the irqs in the domain to virqs starting at first_irq. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. For the legacy domain, IRQ descriptors will also - * be allocated. + * Allocates an irq_domain, and optionally if first_irq is positive then also + * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. * * This is intended to implement the expected behaviour for most - * interrupt controllers which is that a linear mapping should - * normally be used unless the system requires a legacy mapping in - * order to support supplying interrupt numbers during non-DT - * registration of devices. + * interrupt controllers. If device tree is used, then first_irq will be 0 and + * irqs get mapped dynamically on the fly. However, if the controller requires + * static virq assignments (non-DT boot) then it will set that up correctly. */ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, @@ -130,33 +124,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) { - int irq_base; + struct irq_domain *domain; + + domain = __irq_domain_add(of_node, size, 0, ops, host_data); + if (!domain) + return NULL; + if (first_irq > 0) { if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { - /* - * Set the descriptor allocator to search for a - * 1-to-1 mapping, such as irq_alloc_desc_at(). - * Use of_node_to_nid() which is defined to - * numa_node_id() on platforms that have no custom - * implementation. 
- */ - irq_base = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(of_node)); - if (irq_base < 0) { + /* attempt to allocated irq_descs */ + int rc = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (rc < 0) pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); - irq_base = first_irq; - } - } else - irq_base = first_irq; - - return irq_domain_add_legacy(of_node, size, irq_base, 0, - ops, host_data); + } + WARN_ON(irq_domain_associate_many(domain, first_irq, 0, size)); } - /* A linear domain is the default */ - return irq_domain_add_linear(of_node, size, ops, host_data); + return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -184,11 +170,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - pr_debug("Setting up legacy domain virq[%i:%i] ==> hwirq[%i:%i]\n", - first_irq, first_irq + size - 1, - (int)first_hwirq, (int)first_hwirq + size -1); - - domain = irq_domain_add_linear(of_node, first_hwirq + size, ops, host_data); + domain = __irq_domain_add(of_node, first_hwirq + size, 0, ops, host_data); if (!domain) return NULL; @@ -198,43 +180,6 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); -/** - * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: Number of interrupts in the domain. - * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - */ -struct irq_domain *irq_domain_add_linear(struct device_node *of_node, - unsigned int size, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain; - - domain = irq_domain_alloc(of_node, size, ops, host_data); - if (!domain) - return NULL; - - irq_domain_add(domain); - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_linear); - -struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - unsigned int max_irq, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, 0, ops, host_data); - if (domain) { - domain->revmap_direct_max_irq = max_irq ? max_irq : ~0; - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_nomap); - /** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller -- cgit v1.2.3 From 1400ea86025a22862f97e7fe544433751b43ecec Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 22:20:44 +0100 Subject: irqdomain: Beef up debugfs output This patch increases the amount of output produced by the irq_domain_mapping debugfs file by first listing all of the registered irq domains at the beginning of the output, and then by including all mapped IRQs in the output, not just the active ones. It is very useful when debugging irqdomain issues to be able to see the entire list of mapped irqs, not just the ones that happen to be connected to devices. 
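The per-domain summary works by walking every registered domain and sizing its reverse map; distilled, the accounting looks like this (an illustrative sketch mirroring the loop in the patch below, not new functionality):

        /* Sketch: mappings beyond the linear size live in the radix tree, so
         * the "mapped" column is the linear size plus the tree entry count. */
        struct radix_tree_iter iter;
        void **slot;
        int count = 0;

        radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
                count++;
        seq_printf(m, "%s: %u mapped\n", domain->name,
                   domain->revmap_size + count);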
Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e0db59e2eef6..280b8047d8db 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -596,12 +596,29 @@ static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - void *data; + struct irq_domain *domain; + struct radix_tree_iter iter; + void *data, **slot; int i; - seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", + seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", + "name", "mapped", "linear-max", "direct-max", "devtree-node"); + mutex_lock(&irq_domain_mutex); + list_for_each_entry(domain, &irq_domain_list, link) { + int count = 0; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) + count++; + seq_printf(m, "%c%-16s %6u %10u %10u %s\n", + domain == irq_default_domain ? '*' : ' ', domain->name, + domain->revmap_size + count, domain->revmap_size, + domain->revmap_direct_max_irq, + domain->of_node ? of_node_full_name(domain->of_node) : ""); + } + mutex_unlock(&irq_domain_mutex); + + seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "domain name"); + "active", "type", "domain"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); @@ -609,12 +626,15 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); + domain = desc->irq_data.domain; - if (desc->action && desc->action->handler) { + if (domain) { struct irq_chip *chip; + int hwirq = desc->irq_data.hwirq; + bool direct; seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + seq_printf(m, "0x%05x ", hwirq); chip = irq_desc_get_chip(desc); seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); seq_printf(m, "%s\n", desc->irq_data.domain->name); } -- cgit v1.2.3 From ad71d889b88055e61e3970a6744a271a51a94f42 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Apr 2013 15:46:14 -0400 Subject: tracing: Add function probe to trigger a ftrace dump to console Add the "dump" command to have the ftrace buffer dumped to console if a function is hit. This is useful when debugging a triple fault, where you have an idea of a function that is called just before the triple fault occurs, and can tell ftrace to dump its content out to the console before it continues. Format is: <function>:dump echo 'bad_address:dump' > /debug/tracing/set_ftrace_filter To remove this: echo '!bad_address:dump' > /debug/tracing/set_ftrace_filter Requested-by: Luis Claudio R.
Goncalves Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 59 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c4d6d7191988..d7c8719734b8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,6 +290,13 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) trace_dump_stack(STACK_SKIP); } +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ALL); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -327,6 +334,13 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("stacktrace", m, ip, data); } +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("dump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -342,6 +356,11 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = { .print = ftrace_stacktrace_print, }; +static struct ftrace_probe_ops dump_probe_ops = { + .func = ftrace_dump_probe, + .print = ftrace_dump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -425,6 +444,19 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash, param, enable); } +static int +ftrace_dump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &dump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -440,6 +472,11 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = { .func = ftrace_stacktrace_callback, }; +static struct ftrace_func_command ftrace_dump_cmd = { + .name = "dump", + .func = ftrace_dump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -450,13 +487,25 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) - unregister_ftrace_command(&ftrace_traceoff_cmd); + goto out_free_traceoff; ret = register_ftrace_command(&ftrace_stacktrace_cmd); - if (ret) { - unregister_ftrace_command(&ftrace_traceoff_cmd); - unregister_ftrace_command(&ftrace_traceon_cmd); - } + if (ret) + goto out_free_traceon; + + ret = register_ftrace_command(&ftrace_dump_cmd); + if (ret) + goto out_free_stacktrace; + + return 0; + + out_free_stacktrace: + unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: + unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; } #else -- cgit v1.2.3 From 90e3c03c3a09a7b176b3fe59d78f5d9755ac8e37 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Apr 2013 19:00:46 -0400 Subject: tracing: Add function probe to trigger a ftrace dump of current CPU trace Add the "cpudump" command to have the current CPU ftrace buffer dumped to console if a function is hit. 
This is useful when debugging a triple fault, where you have an idea of a function that is called just before the triple fault occurs, and can tell ftrace to dump its content out to the console before it continues. This differs from the "dump" command as it only dumps the content of the ring buffer for the currently executing CPU, and does not show the contents of the other CPUs. Format is: <function>:cpudump echo 'bad_address:cpudump' > /debug/tracing/set_ftrace_filter To remove this: echo '!bad_address:cpudump' > /debug/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d7c8719734b8..b863f93b30f3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -297,6 +297,14 @@ ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) ftrace_dump(DUMP_ALL); } +/* Only dump the current CPU buffer. */ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ORIG); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -341,6 +349,13 @@ ftrace_dump_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("dump", m, ip, data); } +static int +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("cpudump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -361,6 +376,11 @@ static struct ftrace_probe_ops dump_probe_ops = { .func = ftrace_dump_probe, .print = ftrace_dump_print, }; +static struct ftrace_probe_ops cpudump_probe_ops = { + .func = ftrace_cpudump_probe, + .print = ftrace_cpudump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -457,6 +477,19 @@ ftrace_dump_callback(struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; + ops = &cpudump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -477,6 +510,11 @@ static struct ftrace_func_command ftrace_dump_cmd = { .name = "dump", .func = ftrace_dump_callback, }; +static struct ftrace_func_command ftrace_cpudump_cmd = { + .name = "cpudump", + .func = ftrace_cpudump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -497,8 +535,14 @@ static int __init init_func_cmd_traceon(void) if (ret) goto out_free_stacktrace; + ret = register_ftrace_command(&ftrace_cpudump_cmd); + if (ret) + goto out_free_dump; + return 0; + out_free_dump: + unregister_ftrace_command(&ftrace_dump_cmd); out_free_stacktrace: unregister_ftrace_command(&ftrace_stacktrace_cmd); out_free_traceon: -- cgit v1.2.3 From 8092e808a31839c502a52d391b15f31c1d8764f5 Mon Sep 17 00:00:00 2001 From: Harsh Prateek Bora Date: Fri, 24 May 2013 12:52:17 +0530 Subject: tracing/trivial: Consolidate error return condition Consolidate the checks for !enabled and !param to return -EINVAL in event_enable_func().
Link: http://lkml.kernel.org/r/1369380137-12452-1-git-send-email-harsh@linux.vnet.ibm.com Signed-off-by: Harsh Prateek Bora Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 27963e2bf4bf..db086f172cf5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2011,10 +2011,7 @@ event_enable_func(struct ftrace_hash *hash, int ret; /* hash funcs only work with set_ftrace_filter */ - if (!enabled) - return -EINVAL; - - if (!param) + if (!enabled || !param) return -EINVAL; system = strsep(&param, ":"); -- cgit v1.2.3 From 238ae93d699d59876b470bf6455de22bcfaa9a1b Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Sun, 26 May 2013 16:52:01 +0800 Subject: tracing: Fix file mode of free_buffer Commit 4f271a2a60c748599b30bb4dafff30d770439b96 (tracing: Add a proc file to stop tracing and free buffer) implemented a method to free up ring buffer in kernel memory in the release code path of free_buffer's fd. Then we don't need read/write support for free_buffer, indeed we just have a dummy write fop, and don't implement a read fop. So 0200 is a more reasonable file mode for free_buffer than the current file mode 0644. Link: http://lkml.kernel.org/r/20130526085201.GA3183@udknight Acked-by: Vaibhav Nagarnaik Acked-by: David Sharp Signed-off-by: Wang YanQing Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a41023a1f88..5f4a09c12e0b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5935,7 +5935,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_total_size_kb", 0444, d_tracer, tr, &tracing_total_entries_fops); - trace_create_file("free_buffer", 0644, d_tracer, + trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); trace_create_file("trace_marker", 0220, d_tracer, -- cgit v1.2.3 From 7614c3dc74733dff4b0e774f7a894b9ea6ec508c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 20:01:16 -0400 Subject: ftrace: Use schedule_on_each_cpu() as a heavy synchronize_sched() The function tracer uses preempt_disable/enable_notrace() for synchronization between reading registered ftrace_ops and unregistering them. Most of the ftrace_ops are global permanent structures that do not require this synchronization. That is, ops may be added and removed from the hlist but are never freed, and won't hurt if a synchronization is missed. But this is not true for dynamically created ftrace_ops or control_ops, which are used by the perf function tracing. The problem here is that the function tracer can be used to trace kernel/user context switches as well as going to and from idle. Basically, it can be used to trace blind spots of the RCU subsystem. This means that even though preempt_disable() is done, a synchronize_sched() will ignore CPUs that haven't made it out of user space or idle. These can include functions that are being traced just before entering or exiting the kernel sections. To implement the RCU synchronization, instead of using synchronize_sched() the use of schedule_on_each_cpu() is performed. This means that when a dynamically allocated ftrace_ops, or a control ops is being unregistered, all CPUs must be touched and execute a ftrace_sync() stub function via the work queues.
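Distilled to its core, the heavy synchronization is just an empty work item scheduled everywhere (a minimal sketch of the idea; the patch below adds exactly such a stub):

        /* Sketch: the function body is intentionally empty. What matters is
         * that schedule_on_each_cpu() makes every CPU run it from a
         * workqueue, dragging even idle or userspace-bound CPUs through the
         * scheduler before it returns. */
        static void sync_stub(struct work_struct *work)
        {
        }

        static void heavy_synchronize_sched(void)
        {
                schedule_on_each_cpu(sync_stub); /* returns once all CPUs ran it */
        }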
This will rip CPUs out of idle or dynamic tick mode. This only happens when a user disables perf function tracing or other dynamically allocated function tracers, but it allows us to continue to debug RCU and context tracking with function tracing. Link: http://lkml.kernel.org/r/1369785676.15552.55.camel@gandalf.local.home Cc: "Paul E. McKenney" Cc: Tejun Heo Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c508ff33c62..800a8a2fbddb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ +} + static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * so there'll be no new users. We must ensure * all current users are done before we free * the control data. + * Note synchronize_sched() is not enough, as we + * use preempt_disable() to do RCU, but the function + * tracer can be called where RCU is not active + * (before user_exit()). */ - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); control_ops_free(ops); } } else @@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); + return 0; } -- cgit v1.2.3 From aaf6ac0f0871cb7fc0f28f3a00edf329bc7adc29 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 7 Jun 2013 15:07:48 +0900 Subject: tracing: Do not call kmem_cache_free() on allocation failure There's no point calling it when _alloc() failed.
Link: http://lkml.kernel.org/r/1370585268-29169-1-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db086f172cf5..f57b01574a30 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,7 +97,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) - goto err; + return -ENOMEM; field->name = name; field->type = type; @@ -114,11 +114,6 @@ static int __trace_define_field(struct list_head *head, const char *type, list_add(&field->link, head); return 0; - -err: - kmem_cache_free(field_cachep, field); - - return -ENOMEM; } int trace_define_field(struct ftrace_event_call *call, const char *type, -- cgit v1.2.3 From 11682a41618f8094cb7a9330b4b6a12ffaef5774 Mon Sep 17 00:00:00 2001 From: Marcus Gelderie Date: Tue, 4 Jun 2013 09:32:09 +0200 Subject: alarmtimer: Export symbols of functions declared in linux/alarmtimer.h Export symbols so they can be used by drivers/staging/android/alarm-dev.c if it is built as a module. So far alarm-dev is built-in but module support is planned (see drivers/staging/android/TODO). Signed-off-by: Marcus Gelderie [jstultz: tweaked commit message, also export newly added functions] Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 3e5cba274475..eec50fcef9e4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -204,6 +204,7 @@ ktime_t alarm_expires_remaining(const struct alarm *alarm) struct alarm_base *base = &alarm_bases[alarm->type]; return ktime_sub(alarm->node.expires, base->gettime()); } +EXPORT_SYMBOL_GPL(alarm_expires_remaining); #ifdef CONFIG_RTC_CLASS /** @@ -309,6 +310,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; } +EXPORT_SYMBOL_GPL(alarm_init); /** * alarm_start - Sets an absolute alarm to fire @@ -329,6 +331,7 @@ int alarm_start(struct alarm *alarm, ktime_t start) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_start); /** * alarm_start_relative - Sets a relative alarm to fire @@ -342,6 +345,7 @@ int alarm_start_relative(struct alarm *alarm, ktime_t start) start = ktime_add(start, base->gettime()); return alarm_start(alarm, start); } +EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { @@ -354,6 +358,7 @@ void alarm_restart(struct alarm *alarm) alarmtimer_enqueue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); } +EXPORT_SYMBOL_GPL(alarm_restart); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -375,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); /** @@ -392,6 +398,7 @@ int alarm_cancel(struct alarm *alarm) cpu_relax(); } } +EXPORT_SYMBOL_GPL(alarm_cancel); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) @@ -424,6 +431,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) alarm->node.expires = ktime_add(alarm->node.expires, interval); return overrun; } +EXPORT_SYMBOL_GPL(alarm_forward); u64 alarm_forward_now(struct alarm 
*alarm, ktime_t interval) { @@ -431,7 +439,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) return alarm_forward(alarm, base->gettime(), interval); } - +EXPORT_SYMBOL_GPL(alarm_forward_now); /** -- cgit v1.2.3 From 38ff87f77af0b5a93fc8581cff1d6e5692ab8970 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 1 Jun 2013 23:39:40 -0700 Subject: sched_clock: Make ARM's sched_clock generic for all architectures Nothing about the sched_clock implementation in the ARM port is specific to the architecture. Generalize the code so that other architectures can use it by selecting GENERIC_SCHED_CLOCK. Signed-off-by: Stephen Boyd [jstultz: Merge minor collisions with other patches in my tree] Signed-off-by: John Stultz --- kernel/time/Makefile | 1 + kernel/time/sched_clock.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 kernel/time/sched_clock.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index d52ac8bf0006..9250130646f5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -4,6 +4,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 000000000000..aad1ae6077ef --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,215 @@ +/* + * sched_clock.c: support for extending counters to full 64-bit ns counter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct clock_data { + u64 epoch_ns; + u32 epoch_cyc; + u32 epoch_cyc_copy; + unsigned long rate; + u32 mult; + u32 shift; + bool suspended; +}; + +static void sched_clock_poll(unsigned long wrap_ticks); +static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static struct clock_data cd = { + .mult = NSEC_PER_SEC / HZ, +}; + +static u32 __read_mostly sched_clock_mask = 0xffffffff; + +static u32 notrace jiffy_sched_clock_read(void) +{ + return (u32)(jiffies - INITIAL_JIFFIES); +} + +static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ + return (cyc * mult) >> shift; +} + +static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) +{ + u64 epoch_ns; + u32 epoch_cyc; + + /* + * Load the epoch_cyc and epoch_ns atomically. We do this by + * ensuring that we always write epoch_cyc, epoch_ns and + * epoch_cyc_copy in strict order, and read them in strict order. + * If epoch_cyc and epoch_cyc_copy are not equal, then we're in + * the middle of an update, and we should repeat the load. + */ + do { + epoch_cyc = cd.epoch_cyc; + smp_rmb(); + epoch_ns = cd.epoch_ns; + smp_rmb(); + } while (epoch_cyc != cd.epoch_cyc_copy); + + return epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask, cd.mult, cd.shift); +} + +/* + * Atomically update the sched_clock epoch. 
+ */ +static void notrace update_sched_clock(void) +{ + unsigned long flags; + u32 cyc; + u64 ns; + + cyc = read_sched_clock(); + ns = cd.epoch_ns + + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + /* + * Write epoch_cyc and epoch_ns in a way that the update is + * detectable in cyc_to_fixed_sched_clock(). + */ + raw_local_irq_save(flags); + cd.epoch_cyc_copy = cyc; + smp_wmb(); + cd.epoch_ns = ns; + smp_wmb(); + cd.epoch_cyc = cyc; + raw_local_irq_restore(flags); +} + +static void sched_clock_poll(unsigned long wrap_ticks) +{ + mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); + update_sched_clock(); +} + +void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) +{ + unsigned long r, w; + u64 res, wrap; + char r_unit; + + if (cd.rate > rate) + return; + + BUG_ON(bits > 32); + WARN_ON(!irqs_disabled()); + read_sched_clock = read; + sched_clock_mask = (1 << bits) - 1; + cd.rate = rate; + + /* calculate the mult/shift to convert counter ticks to ns. */ + clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); + + r = rate; + if (r >= 4000000) { + r /= 1000000; + r_unit = 'M'; + } else if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else + r_unit = ' '; + + /* calculate how many ns until we wrap */ + wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); + do_div(wrap, NSEC_PER_MSEC); + w = wrap; + + /* calculate the ns resolution of this counter */ + res = cyc_to_ns(1ULL, cd.mult, cd.shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", + bits, r, r_unit, res, w); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); + update_sched_clock(); + + /* + * Ensure that sched_clock() starts off at 0ns + */ + cd.epoch_ns = 0; + + /* Enable IRQ time accounting if we have a fast enough sched_clock */ + if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) + enable_sched_clock_irqtime(); + + pr_debug("Registered %pF as sched_clock source\n", read); +} + +static unsigned long long notrace sched_clock_32(void) +{ + u32 cyc = read_sched_clock(); + return cyc_to_sched_clock(cyc, sched_clock_mask); +} + +unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; + +unsigned long long notrace sched_clock(void) +{ + if (cd.suspended) + return cd.epoch_ns; + + return sched_clock_func(); +} + +void __init sched_clock_postinit(void) +{ + /* + * If no sched_clock function has been provided at that point, + * make it the final one one. 
+ */ + if (read_sched_clock == jiffy_sched_clock_read) + setup_sched_clock(jiffy_sched_clock_read, 32, HZ); + + sched_clock_poll(sched_clock_timer.data); +} + +static int sched_clock_suspend(void) +{ + sched_clock_poll(sched_clock_timer.data); + cd.suspended = true; + return 0; +} + +static void sched_clock_resume(void) +{ + cd.epoch_cyc = read_sched_clock(); + cd.epoch_cyc_copy = cd.epoch_cyc; + cd.suspended = false; +} + +static struct syscore_ops sched_clock_ops = { + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ + register_syscore_ops(&sched_clock_ops); + return 0; +} +device_initcall(sched_clock_syscore_init); -- cgit v1.2.3 From c5cdc67a58a22c49f558b450c6f748251ceb2e7b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 13 Jun 2013 22:19:43 +0000 Subject: irqdomain: Remove temporary MIPS workaround code The MIPS interrupt controllers are all registering their own irq_domains now. Drop the MIPS specific code because it is no longer needed. Signed-off-by: Grant Likely Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/5458/ Signed-off-by: Ralf Baechle --- kernel/irq/irqdomain.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 54a4d5223238..a341b3d433ad 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -665,18 +665,6 @@ unsigned int irq_create_of_mapping(struct device_node *controller, domain = controller ? irq_find_host(controller) : irq_default_domain; if (!domain) { -#ifdef CONFIG_MIPS - /* - * Workaround to avoid breaking interrupt controller drivers - * that don't yet register an irq_domain. This is temporary - * code. ~~~gcl, Feb 24, 2012 - * - * Scheduled for removal in Linux v3.6. That should be enough - * time. - */ - if (intsize > 0) - return intspec[0]; -#endif pr_warning("no irq domain found for %s !\n", of_node_full_name(controller)); return 0; -- cgit v1.2.3 From 336ae1180df5f69b9e0fb6561bec01c5f64361cf Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 17 Jun 2013 15:40:58 -0700 Subject: ARM: sched_clock: Load cycle count after epoch stabilizes There is a small race between when the cycle count is read from the hardware and when the epoch stabilizes. Consider this scenario:

	CPU0				CPU1
	----				----
	cyc = read_sched_clock()
	cyc_to_sched_clock()
					update_sched_clock()
					...
					cd.epoch_cyc = cyc;
	epoch_cyc = cd.epoch_cyc;
	...
	epoch_ns + cyc_to_ns((cyc - epoch_cyc)

The cyc on cpu0 was read before the epoch changed. But we calculate the nanoseconds based on the new epoch by subtracting the new epoch from the old cycle count. Since epoch is most likely larger than the old cycle count we calculate a large number that will be converted to nanoseconds and added to epoch_ns, causing time to jump forward too much. Fix this problem by reading the hardware after the epoch has stabilized.
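A condensed sketch of the corrected read side (an editor's illustration, not the patch itself; cd, read_sched_clock(), sched_clock_mask and cyc_to_ns() are the names used by kernel/time/sched_clock.c, and the suspend handling is omitted here):

	static unsigned long long notrace sched_clock_32(void)
	{
		u64 epoch_ns;
		u32 epoch_cyc, cyc;

		/* Spin until a consistent epoch pair has been observed. */
		do {
			epoch_cyc = cd.epoch_cyc;
			smp_rmb();
			epoch_ns = cd.epoch_ns;
			smp_rmb();
		} while (epoch_cyc != cd.epoch_cyc_copy);

		/* Sample the counter only now, so it can never be older
		 * than the epoch it is compared against. */
		cyc = read_sched_clock();
		cyc = (cyc - epoch_cyc) & sched_clock_mask;
		return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
	}

Reading the hardware after the epoch loop guarantees the sampled cycle count is at least as new as epoch_cyc (modulo counter wrap), so the masked delta can no longer explode into a huge positive value.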
Cc: Russell King Signed-off-by: Stephen Boyd Signed-off-by: John Stultz --- kernel/time/sched_clock.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index aad1ae6077ef..a326f27d7f09 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -49,10 +49,14 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } -static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) +static unsigned long long notrace sched_clock_32(void) { u64 epoch_ns; u32 epoch_cyc; + u32 cyc; + + if (cd.suspended) + return cd.epoch_ns; /* * Load the epoch_cyc and epoch_ns atomically. We do this by @@ -68,7 +72,9 @@ static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) smp_rmb(); } while (epoch_cyc != cd.epoch_cyc_copy); - return epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask, cd.mult, cd.shift); + cyc = read_sched_clock(); + cyc = (cyc - epoch_cyc) & sched_clock_mask; + return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); } /* @@ -160,19 +166,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -static unsigned long long notrace sched_clock_32(void) -{ - u32 cyc = read_sched_clock(); - return cyc_to_sched_clock(cyc, sched_clock_mask); -} - unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; unsigned long long notrace sched_clock(void) { - if (cd.suspended) - return cd.epoch_ns; - return sched_clock_func(); } -- cgit v1.2.3 From e12d0271774fea9fddf1e2a7952a0bffb2ee8e8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 May 2013 17:12:28 -0400 Subject: nohz: Warn if the machine can not perform nohz_full If the user configures NO_HZ_FULL and defines nohz_full=XXX on the kernel command line, or enables NO_HZ_FULL_ALL, but nohz fails due to the machine having an unstable clock, warn about it. We do not want users thinking that they are getting the benefit of nohz when their machine cannot support it. Signed-off-by: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf4..d87d22cb9bf2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -178,6 +178,11 @@ static bool can_stop_full_tick(void) */ if (!sched_clock_stable) { trace_tick_stop(0, "unstable sched clock\n"); + /* + * Don't allow the user to think they can get + * full NO_HZ with this machine. + */ + WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); return false; } #endif -- cgit v1.2.3 From b8900bc0217fac8e68085997bee2f05e6db931a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 6 Jun 2013 15:42:53 +0200 Subject: watchdog: Register / unregister watchdog kthreads on sysctl control The user activation/deactivation of the watchdog through boot parameters or sysctl is currently implemented with a dance involving kthread parking and unparking methods: the threads are unconditionally registered on boot and they park as soon as the user wants the watchdog to be disabled.
This method involves a few noisy details to handle though: the watchdog kthreads may be unparked anytime due to hotplug operations, after which the watchdog internals have to decide to park again if it is user-disabled. As a result the setup() and unpark() methods need to be able to request a reparking. This is not currently supported in the kthread infrastructure so this piece of the watchdog code only works halfway. Besides, unparking/reparking the watchdog kthreads consume unnecessary cputime on hotplug operations when those could be simply ignored in the first place. As suggested by Srivatsa, let's instead only register the watchdog threads when they are needed. This way we don't need to think about hotplug operations and we don't burden the CPU onlining when the watchdog is simply disabled. Suggested-by: Srivatsa S. Bhat Signed-off-by: Frederic Weisbecker Cc: Srivatsa S. Bhat Cc: Anish Singh Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Don Zickus --- kernel/watchdog.c | 87 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 05039e348f07..52c9a9b91bdd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,7 +31,7 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; -static int __read_mostly watchdog_disabled; +static int __read_mostly watchdog_disabled = 1; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -347,11 +347,6 @@ static void watchdog_enable(unsigned int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; - if (!watchdog_enabled) { - kthread_park(current); - return; - } - /* Enable the perf event */ watchdog_nmi_enable(cpu); @@ -374,6 +369,11 @@ static void watchdog_disable(unsigned int cpu) watchdog_nmi_disable(cpu); } +static void watchdog_cleanup(unsigned int cpu, bool online) +{ + watchdog_disable(cpu); +} + static int watchdog_should_run(unsigned int cpu) { return __this_cpu_read(hrtimer_interrupts) != @@ -475,28 +475,40 @@ static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -/* prepare/enable/disable routines */ -/* sysctl functions */ -#ifdef CONFIG_SYSCTL -static void watchdog_enable_all_cpus(void) +static struct smp_hotplug_thread watchdog_threads = { + .store = &softlockup_watchdog, + .thread_should_run = watchdog_should_run, + .thread_fn = watchdog, + .thread_comm = "watchdog/%u", + .setup = watchdog_enable, + .cleanup = watchdog_cleanup, + .park = watchdog_disable, + .unpark = watchdog_enable, +}; + +static int watchdog_enable_all_cpus(void) { - unsigned int cpu; + int err = 0; if (watchdog_disabled) { - watchdog_disabled = 0; - for_each_online_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + err = smpboot_register_percpu_thread(&watchdog_threads); + if (err) + pr_err("Failed to create watchdog threads, disabled\n"); + else + watchdog_disabled = 0; } + + return err; } +/* prepare/enable/disable routines */ +/* sysctl functions */ +#ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { - unsigned int cpu; - if (!watchdog_disabled) { watchdog_disabled = 1; - for_each_online_cpu(cpu) - kthread_park(per_cpu(softlockup_watchdog, cpu)); + smpboot_unregister_percpu_thread(&watchdog_threads); } } 
@@ -507,14 +519,14 @@ static void watchdog_disable_all_cpus(void) int proc_dowatchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; + int err, old_thresh, old_enabled; - if (watchdog_disabled < 0) - return -ENODEV; + old_thresh = ACCESS_ONCE(watchdog_thresh); + old_enabled = ACCESS_ONCE(watchdog_enabled); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (ret || !write) - return ret; + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (err || !write) + return err; set_sample_period(); /* @@ -523,29 +535,24 @@ int proc_dowatchdog(struct ctl_table *table, int write, * watchdog_*_all_cpus() function takes care of this. */ if (watchdog_enabled && watchdog_thresh) - watchdog_enable_all_cpus(); + err = watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); - return ret; + /* Restore old values on failure */ + if (err) { + watchdog_thresh = old_thresh; + watchdog_enabled = old_enabled; + } + + return err; } #endif /* CONFIG_SYSCTL */ -static struct smp_hotplug_thread watchdog_threads = { - .store = &softlockup_watchdog, - .thread_should_run = watchdog_should_run, - .thread_fn = watchdog, - .thread_comm = "watchdog/%u", - .setup = watchdog_enable, - .park = watchdog_disable, - .unpark = watchdog_enable, -}; - void __init lockup_detector_init(void) { set_sample_period(); - if (smpboot_register_percpu_thread(&watchdog_threads)) { - pr_err("Failed to create watchdog threads, disabled\n"); - watchdog_disabled = -ENODEV; - } + + if (watchdog_enabled) + watchdog_enable_all_cpus(); } -- cgit v1.2.3 From 1a891cf19cdfb645827969cc6aeaeebdefeb87b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Jun 2013 13:16:25 -0400 Subject: tracing: Add binary '&' filter for events There are some cases when filtering on a set flag of a field of a tracepoint is useful. But currently the only filtering commands for numbered fields are ==, !=, <, <=, >, >=. This does not help when you just want to trace if a specific flag is set. For example: > # sudo trace-cmd record -e brcmfmac:brcmf_dbg -f 'level & 0x40000' > disable all > enable brcmfmac:brcmf_dbg > path = /sys/kernel/debug/tracing/events/brcmfmac/brcmf_dbg/enable > (level & 0x40000) > ^ > parse_error: Invalid operator > When trying to trace brcmf_dbg when level has its 1 << 18 bit set, the filter fails to parse. Allowing a binary '&' operation gives the user the ability to test a bit. Note, a binary '|' is not added, as it doesn't make sense: fields must be compared to constants (for now), and ORing a constant will always return true.
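A small C illustration of why the bit test is needed (an editor's sketch; 0x40000 is the 1 << 18 flag from the example above):

	u32 level = (1 << 18) | (1 << 0);	/* flag of interest plus another bit */

	int eq   = (level == 0x40000);		/* 0: '==' wants an exact match   */
	int band = ((level & 0x40000) != 0);	/* 1: '&' tests just the one flag */

With the patch applied the same expression can also be set directly through the tracefs filter file, e.g. echo 'level & 0x40000' > /sys/kernel/debug/tracing/events/brcmfmac/brcmf_dbg/filter.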
Link: http://lkml.kernel.org/r/1371057385.9844.261.camel@gandalf.local.home Suggested-by: Arend van Spriel Tested-by: Arend van Spriel Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e1b653f7e1ca..0d883dc057d6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -44,6 +44,7 @@ enum filter_op_ids OP_LE, OP_GT, OP_GE, + OP_BAND, OP_NONE, OP_OPEN_PAREN, }; @@ -54,6 +55,7 @@ struct filter_op { int precedence; }; +/* Order must be the same as enum filter_op_ids above */ static struct filter_op filter_ops[] = { { OP_OR, "||", 1 }, { OP_AND, "&&", 2 }, @@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = { { OP_LE, "<=", 5 }, { OP_GT, ">", 5 }, { OP_GE, ">=", 5 }, + { OP_BAND, "&", 6 }, { OP_NONE, "OP_NONE", 0 }, { OP_OPEN_PAREN, "(", 0 }, }; @@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ case OP_GE: \ match = (*addr >= val); \ break; \ + case OP_BAND: \ + match = (*addr & val); \ + break; \ default: \ break; \ } \ -- cgit v1.2.3 From de7edd31457b626e54a0b2a7e8ff4d65492f01ad Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 14 Jun 2013 16:21:43 -0400 Subject: tracing: Disable tracing on warning Add a traceoff_on_warning option, available both on the kernel command line and as a sysctl option. When set, any WARN*() function that is hit will cause the tracing_on variable to be cleared, which disables writing to the ring buffer. This is especially useful when tracing a bug with function tracing. When a warning is hit, the print caused by the warning can flood the trace with the functions that produce the output for the warning. This can make the resulting trace useless by either hiding where the bug happened, or worse, by overflowing the buffer and losing the trace of the bug totally.
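The mechanism reduces to a latched flag checked from the WARN path; a minimal sketch (an editor's illustration, using the names the patch introduces):

	int __disable_trace_on_warning;		/* set via boot param or sysctl */

	void disable_trace_on_warning(void)
	{
		if (__disable_trace_on_warning)
			tracing_off();		/* freeze the ring buffer in place */
	}

warn_slowpath_common() calls this before printing anything, so the buffer still holds the trace that led up to the warning. At runtime the flag can be flipped with sysctl kernel.traceoff_on_warning=1.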
Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/panic.c | 3 +++ kernel/sysctl.c | 7 +++++++ kernel/trace/trace.c | 17 +++++++++++++++++ 3 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..4cea6cc628ab 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -399,6 +400,8 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { + disable_trace_on_warning(); + printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fc..5b0f18c12800 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -600,6 +600,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #ifdef CONFIG_MODULES { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f4a09c12e0b..c4c9296b1916 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -115,6 +115,9 @@ cpumask_var_t __read_mostly tracing_buffer_mask; enum ftrace_dump_mode ftrace_dump_on_oops; +/* When set, tracing will stop when a WARN*() is hit */ +int __disable_trace_on_warning; + static int tracing_set_tracer(const char *buf); #define MAX_TRACER_SIZE 100 @@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init stop_trace_on_warning(char *str) +{ + __disable_trace_on_warning = 1; + return 1; +} +__setup("traceoff_on_warning=", stop_trace_on_warning); + static int __init boot_alloc_snapshot(char *str) { allocate_snapshot = true; @@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str) } __setup("trace_options=", set_trace_boot_options); + unsigned long long ns2usecs(cycle_t nsec) { nsec += 500; @@ -562,6 +573,12 @@ void tracing_off(void) } EXPORT_SYMBOL_GPL(tracing_off); +void disable_trace_on_warning(void) +{ + if (__disable_trace_on_warning) + tracing_off(); +} + /** * tracing_is_on - show state of ring buffers enabled */ -- cgit v1.2.3 From 195a84d91e92ee3fe571a2086a6db7e17bf5bc7c Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Fri, 14 Jun 2013 10:10:38 +0800 Subject: tracing/kprobes: Remove unnecessary checking of trace_probe_is_enabled Since tp->flags assignment was moved into function enable_trace_probe(), there is no need to use trace_probe_is_enabled to check flags in the same function. Remove the unnecessary checking. 
Link: http://lkml.kernel.org/r/51BA7B9E.3040807@huawei.com Acked-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9f46e98ba8f2..f2374172ba7b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -240,8 +240,7 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } else tp->flags |= TP_FLAG_PROFILE; - if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && - !trace_probe_has_gone(tp)) { + if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { if (trace_probe_is_return(tp)) ret = enable_kretprobe(&tp->rp); else -- cgit v1.2.3 From 52d85d763086594f139bf7d3a5641abeb91d9f57 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 12 Jun 2013 12:03:18 +0200 Subject: ftrace: Fix stddev calculation in function profiler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When FUNCTION_GRAPH_TRACER is enabled, ftrace can profile kernel functions and print basic statistics about them. Unfortunately, the running stddev calculation is wrong. This patch corrects it by implementing Welford’s method: s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2). Link: http://lkml.kernel.org/r/1371031398-24048-1-git-send-email-juri.lelli@gmail.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 800a8a2fbddb..26e19105cdcc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -641,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v) if (rec->counter <= 1) stddev = 0; else { - stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Apply Welford's method: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = rec->counter * rec->time_squared - + rec->time * rec->time; + /* + * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, (rec->counter - 1) * 1000); + do_div(stddev, rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); -- cgit v1.2.3 From 3c00ea82c724fab0b98f15428a804cb45eb9ad38 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 May 2013 20:45:15 +0200 Subject: watchdog: Rename confusing state variable We have two very conflicting state variable names in the watchdog: * watchdog_enabled: This one reflects the user interface. It's set to 1 by default and can be overridden with boot options or the sysctl/procfs interface. * watchdog_disabled: This is the internal toggle state that tells if watchdog threads, timers and NMI events are currently running or not. This state mostly depends on the user settings. It's a convenient state latch. Now we really need to find clearer names because those are just too confusing to encourage deep review. watchdog_enabled now becomes watchdog_user_enabled to reflect its purpose as an interface. watchdog_disabled becomes watchdog_running to suggest its role as a pure internal state. Signed-off-by: Frederic Weisbecker Cc: Srivatsa S. Bhat Cc: Anish Singh Cc: Steven Rostedt Cc: Paul E.
McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Don Zickus --- kernel/sysctl.c | 4 ++-- kernel/watchdog.c | 30 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fc..b0805652c4ff 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -801,7 +801,7 @@ static struct ctl_table kern_table[] = { #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", - .data = &watchdog_enabled, + .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, .proc_handler = proc_dowatchdog, @@ -828,7 +828,7 @@ static struct ctl_table kern_table[] = { }, { .procname = "nmi_watchdog", - .data = &watchdog_enabled, + .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, .proc_handler = proc_dowatchdog, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 52c9a9b91bdd..51c4f34d258e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,9 +29,9 @@ #include #include -int watchdog_enabled = 1; +int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; -static int __read_mostly watchdog_disabled = 1; +static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -63,7 +63,7 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -82,7 +82,7 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); @@ -90,7 +90,7 @@ __setup("nowatchdog", nowatchdog_setup); /* deprecated */ static int __init nosoftlockup_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -158,7 +158,7 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - if (watchdog_enabled) { + if (watchdog_user_enabled) { unsigned cpu; for_each_present_cpu(cpu) { @@ -490,12 +490,12 @@ static int watchdog_enable_all_cpus(void) { int err = 0; - if (watchdog_disabled) { + if (!watchdog_running) { err = smpboot_register_percpu_thread(&watchdog_threads); if (err) pr_err("Failed to create watchdog threads, disabled\n"); else - watchdog_disabled = 0; + watchdog_running = 1; } return err; @@ -506,8 +506,8 @@ static int watchdog_enable_all_cpus(void) #ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { - if (!watchdog_disabled) { - watchdog_disabled = 1; + if (watchdog_running) { + watchdog_running = 0; smpboot_unregister_percpu_thread(&watchdog_threads); } } @@ -522,7 +522,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, int err, old_thresh, old_enabled; old_thresh = ACCESS_ONCE(watchdog_thresh); - old_enabled = ACCESS_ONCE(watchdog_enabled); + old_enabled = ACCESS_ONCE(watchdog_user_enabled); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err || !write) @@ -531,10 +531,10 @@ int proc_dowatchdog(struct ctl_table *table, int write, set_sample_period(); /* * Watchdog threads shouldn't be enabled if they are - * disabled. The 'watchdog_disabled' variable check in + * disabled. 
The 'watchdog_running' variable check in * watchdog_*_all_cpus() function takes care of this. */ - if (watchdog_enabled && watchdog_thresh) + if (watchdog_user_enabled && watchdog_thresh) err = watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); @@ -542,7 +542,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, /* Restore old values on failure */ if (err) { watchdog_thresh = old_thresh; - watchdog_enabled = old_enabled; + watchdog_user_enabled = old_enabled; } return err; @@ -553,6 +553,6 @@ void __init lockup_detector_init(void) { set_sample_period(); - if (watchdog_enabled) + if (watchdog_user_enabled) watchdog_enable_all_cpus(); } -- cgit v1.2.3 From 940be35ac0139530d7554aa2352a8388e3d4adca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Jun 2013 13:35:42 +0200 Subject: watchdog: Boot-disable by default on full dynticks When the watchdog runs, it prevents the full dynticks CPUs from stopping their tick because the hard lockup detector uses perf events internally, which in turn rely on the periodic tick. Since this is a rather confusing behaviour that is not easy to track down and identify for those who want to test CONFIG_NO_HZ_FULL, let's default disable the watchdog on boot time when full dynticks is enabled. The user can still enable it later on runtime using proc or sysctl. Reported-by: Steven Rostedt Suggested-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Li Zhong Cc: Don Zickus Cc: Srivatsa S. Bhat Cc: Anish Singh --- kernel/watchdog.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 51c4f34d258e..1241d8c91d5e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -553,6 +553,14 @@ void __init lockup_detector_init(void) { set_sample_period(); +#ifdef CONFIG_NO_HZ_FULL + if (watchdog_user_enabled) { + watchdog_user_enabled = 0; + pr_warning("Disabled lockup detectors by default for full dynticks\n"); + pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n"); + } +#endif + if (watchdog_user_enabled) watchdog_enable_all_cpus(); } -- cgit v1.2.3 From 5b8621a68fdcd2baf1d3b413726f913a5254d46a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Jun 2013 13:47:31 +0200 Subject: nohz: Remove obsolete check for full dynticks CPUs to be RCU nocbs Building full dynticks now implies that all CPUs are forced into RCU nocb mode through CONFIG_RCU_NOCB_CPU_ALL. The dynamic check has become useless. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. 
McKenney Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Li Zhong Cc: Borislav Petkov --- kernel/time/tick-sched.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d87d22cb9bf2..b15750139260 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,6 @@ void __init tick_nohz_init(void) } cpu_notifier(tick_nohz_cpu_down_callback, 0); - - /* Make sure full dynticks CPU are also RCU nocbs */ - for_each_cpu(cpu, nohz_full_mask) { - if (!rcu_is_nocb_cpu(cpu)) { - pr_warning("NO_HZ: CPU %d is not RCU nocb: " - "cleared from nohz_full range", cpu); - cpumask_clear_cpu(cpu, nohz_full_mask); - } - } - cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); } -- cgit v1.2.3 From ddaf144c61da45ae5c49ae38556c3ac4524f9318 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 10 Jun 2013 01:06:02 +0100 Subject: irqdomain: Refactor irq_domain_associate_many() Originally, irq_domain_associate_many() was designed to unwind the mapped irqs on a failure of any individual association. However, that proved to be a problem with certain IRQ controllers. Some of them only support a subset of irqs, and will fail when attempting to map a reserved IRQ. In those cases we want to map as many IRQs as possible, so instead it is better for irq_domain_associate_many() to make a best-effort attempt to map irqs, but not fail if any or all of them don't succeed. If a caller really cares about how many irqs got associated, then it should instead go back and check that all of the irqs it cares about were mapped. The original design open-coded the individual association code into the body of irq_domain_associate_many(), but with no longer needing to unwind associations, it becomes simpler to split out irq_domain_associate() to contain the bulk of the logic and to make irq_domain_associate_many() a simple loop wrapper. This patch also adds a new error check to the associate path to make sure it isn't called for an irq larger than the controller can handle, and adds locking so that the irq_domain_mutex is held while setting up a new association. v3: Fixup missing change to irq_domain_add_tree() v2: Fixup x86 warning. irq_domain_associate_many() no longer returns an error code, but reports errors to the printk log directly. In the majority of cases we don't actually want to fail if there is a problem, but rather log it and still try to boot the system. Signed-off-by: Grant Likely irqdomain: Fix flubbed irq_domain_associate_many refactoring commit d39046ec72, "irqdomain: Refactor irq_domain_associate_many()" was missing the following hunk, which causes a boot failure on anything using irq_domain_add_tree() to allocate an irq domain. Signed-off-by: Grant Likely Cc: Michael Neuling Cc: Benjamin Herrenschmidt , Cc: Thomas Gleixner , Cc: Stephen Rothwell --- kernel/irq/irqdomain.c | 185 ++++++++++++++++++++++++------------------------- 1 file changed, 91 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 280b8047d8db..80e92492c77b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -35,8 +35,8 @@ static struct irq_domain *irq_default_domain; * register allocated irq_domain with irq_domain_register(). Returns pointer * to IRQ domain, or NULL on failure.
*/ -struct irq_domain *__irq_domain_add(struct device_node *of_node, - int size, int direct_max, +struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, + irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) { @@ -52,6 +52,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); + domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; @@ -126,7 +127,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, size, 0, ops, host_data); + domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); if (!domain) return NULL; @@ -139,7 +140,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); } - WARN_ON(irq_domain_associate_many(domain, first_irq, 0, size)); + irq_domain_associate_many(domain, first_irq, 0, size); } return domain; @@ -170,11 +171,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, first_hwirq + size, 0, ops, host_data); + domain = __irq_domain_add(of_node, first_hwirq + size, + first_hwirq + size, 0, ops, host_data); if (!domain) return NULL; - WARN_ON(irq_domain_associate_many(domain, first_irq, first_hwirq, size)); + irq_domain_associate_many(domain, first_irq, first_hwirq, size); return domain; } @@ -228,109 +230,109 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate_many(struct irq_domain *domain, - unsigned int irq_base, int count) +static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { - /* - * disassociate in reverse order; - * not strictly necessary, but nice for unwinding - */ - while (count--) { - int irq = irq_base + count; - struct irq_data *irq_data = irq_get_irq_data(irq); - irq_hw_number_t hwirq; + struct irq_data *irq_data = irq_get_irq_data(irq); + irq_hw_number_t hwirq; - if (WARN_ON(!irq_data || irq_data->domain != domain)) - continue; + if (WARN(!irq_data || irq_data->domain != domain, + "virq%i doesn't exist; cannot disassociate\n", irq)) + return; - hwirq = irq_data->hwirq; - irq_set_status_flags(irq, IRQ_NOREQUEST); + hwirq = irq_data->hwirq; + irq_set_status_flags(irq, IRQ_NOREQUEST); - /* remove chip and handler */ - irq_set_chip_and_handler(irq, NULL, NULL); + /* remove chip and handler */ + irq_set_chip_and_handler(irq, NULL, NULL); - /* Make sure it's completed */ - synchronize_irq(irq); + /* Make sure it's completed */ + synchronize_irq(irq); - /* Tell the PIC about it */ - if (domain->ops->unmap) - domain->ops->unmap(domain, irq); - smp_mb(); + /* Tell the PIC about it */ + if (domain->ops->unmap) + domain->ops->unmap(domain, irq); + smp_mb(); - irq_data->domain = NULL; - irq_data->hwirq = 0; + irq_data->domain = NULL; + irq_data->hwirq = 0; - /* Clear reverse map for this hwirq */ - if (hwirq < domain->revmap_size) { - domain->linear_revmap[hwirq] = 0; - } else { - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - } + /* Clear reverse map for this hwirq */ + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = 0; + } else { + mutex_lock(&revmap_trees_mutex); + 
radix_tree_delete(&domain->revmap_tree, hwirq); + mutex_unlock(&revmap_trees_mutex); } } -int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, - irq_hw_number_t hwirq_base, int count) +int irq_domain_associate(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { - unsigned int virq = irq_base; - irq_hw_number_t hwirq = hwirq_base; - int i, ret; + struct irq_data *irq_data = irq_get_irq_data(virq); + int ret; - pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, - of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + if (WARN(hwirq >= domain->hwirq_max, + "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name)) + return -EINVAL; + if (WARN(!irq_data, "error: virq%i is not allocated", virq)) + return -EINVAL; + if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) + return -EINVAL; - for (i = 0; i < count; i++) { - struct irq_data *irq_data = irq_get_irq_data(virq + i); - - if (WARN(!irq_data, "error: irq_desc not allocated; " - "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) - return -EINVAL; - if (WARN(irq_data->domain, "error: irq_desc already associated; " - "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) - return -EINVAL; - }; - - for (i = 0; i < count; i++, virq++, hwirq++) { - struct irq_data *irq_data = irq_get_irq_data(virq); - - irq_data->hwirq = hwirq; - irq_data->domain = domain; - if (domain->ops->map) { - ret = domain->ops->map(domain, virq, hwirq); - if (ret != 0) { - /* - * If map() returns -EPERM, this interrupt is protected - * by the firmware or some other service and shall not - * be mapped. Don't bother telling the user about it. - */ - if (ret != -EPERM) { - pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", - domain->name, hwirq, virq, ret); - } - irq_data->domain = NULL; - irq_data->hwirq = 0; - continue; + mutex_lock(&irq_domain_mutex); + irq_data->hwirq = hwirq; + irq_data->domain = domain; + if (domain->ops->map) { + ret = domain->ops->map(domain, virq, hwirq); + if (ret != 0) { + /* + * If map() returns -EPERM, this interrupt is protected + * by the firmware or some other service and shall not + * be mapped. Don't bother telling the user about it. 
+ */ + if (ret != -EPERM) { + pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", + domain->name, hwirq, virq, ret); } - /* If not already assigned, give the domain the chip's name */ - if (!domain->name && irq_data->chip) - domain->name = irq_data->chip->name; + irq_data->domain = NULL; + irq_data->hwirq = 0; + mutex_unlock(&irq_domain_mutex); + return ret; } - if (hwirq < domain->revmap_size) { - domain->linear_revmap[hwirq] = virq; - } else { - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); - } + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && irq_data->chip) + domain->name = irq_data->chip->name; + } - irq_clear_status_flags(virq, IRQ_NOREQUEST); + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = virq; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); } + mutex_unlock(&irq_domain_mutex); + + irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; } +EXPORT_SYMBOL_GPL(irq_domain_associate); + +void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, + irq_hw_number_t hwirq_base, int count) +{ + int i; + + pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, + of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + + for (i = 0; i < count; i++) { + irq_domain_associate(domain, irq_base + i, hwirq_base + i); + } +} EXPORT_SYMBOL_GPL(irq_domain_associate_many); /** @@ -460,12 +462,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, if (unlikely(ret < 0)) return ret; - ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); - if (unlikely(ret < 0)) { - irq_free_descs(irq_base, count); - return ret; - } - + irq_domain_associate_many(domain, irq_base, hwirq_base, count); return 0; } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); @@ -535,7 +532,7 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - irq_domain_disassociate_many(domain, virq, 1); + irq_domain_disassociate(domain, virq); irq_free_desc(virq); } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -- cgit v1.2.3 From 56a3d5ac774d054ece9373277a861338a468a294 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 10 Jun 2013 01:09:33 +0100 Subject: irqdomain: remove irq_domain_generate_simple() Nobody calls it; remove the function Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 80e92492c77b..e47b35671384 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -741,18 +741,3 @@ const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - -#ifdef CONFIG_OF_IRQ -void irq_domain_generate_simple(const struct of_device_id *match, - u64 phys_base, unsigned int irq_start) -{ - struct device_node *node; - pr_debug("looking for phys_base=%llx, irq_start=%i\n", - (unsigned long long) phys_base, (int) irq_start); - node = of_find_matching_node_by_address(NULL, match, phys_base); - if (node) - irq_domain_add_legacy(node, 32, irq_start, 0, - &irq_domain_simple_ops, NULL); -} -EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif -- cgit v1.2.3 From d3dcb436f61593843af178d4a520c8c43c04d3fc Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 10 Jun 
2013 12:19:17 +0100 Subject: irqdomain: make irq_linear_revmap() a fast path again Over the years, irq_linear_revmap() gained tests and checks to make sure callers were using it safely, which while important, also make it less of a fast path. After the irqdomain refactoring done recently, it is now possible to make irq_linear_revmap() a fast path again. This patch moves irq_linear_revmap() to the header file and makes it a static inline so that interrupt controller drivers using a linear mapping can decode the virq from a hwirq in just a couple of instructions. Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e47b35671384..836a0f7ec2a9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -559,35 +559,17 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return hwirq; } - return irq_linear_revmap(domain, hwirq); + /* Check if the hwirq is in the linear revmap. */ + if (hwirq < domain->revmap_size) + return domain->linear_revmap[hwirq]; + + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_tree, hwirq); + rcu_read_unlock(); + return data ? data->irq : 0; } EXPORT_SYMBOL_GPL(irq_find_mapping); -/** - * irq_linear_revmap() - Find a linux irq from a hw irq number. - * @domain: domain owning this hardware interrupt - * @hwirq: hardware irq number in that domain space - * - * This is a fast path that can be called directly by irq controller code to - * save a handful of instructions. - */ -unsigned int irq_linear_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - struct irq_data *data; - - /* Check revmap bounds; complain if exceeded */ - if (hwirq >= domain->revmap_size) { - rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_tree, hwirq); - rcu_read_unlock(); - return data ? data->irq : 0; - } - - return domain->linear_revmap[hwirq]; -} -EXPORT_SYMBOL_GPL(irq_linear_revmap); - #ifdef CONFIG_IRQ_DOMAIN_DEBUG static int virq_debug_show(struct seq_file *m, void *private) { -- cgit v1.2.3 From c12d2f42a96d72cffa4d9335ca455a2243333c79 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 26 Jan 2012 16:29:19 -0700 Subject: irqdomain: Include hwirq number in /proc/interrupts Add the hardware interrupt number to the output of /proc/interrupts. It is often important to have access to the hardware interrupt number because it identifies exactly how an interrupt signal is wired up to the interrupt controller. This is especially important when using irq_domains since irq numbers get dynamically allocated in that case, and have no relation to the actual hardware number. Note: This output is currently conditional on whether or not the irq_domain pointer is set; however hwirq could still be used without irq_domain. It may be worthwhile to always output the hwirq number regardless of the domain pointer. 
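For illustration only, a /proc/interrupts line would now carry the hardware number after the chip name, roughly as below (the device name, counts and exact column layout are hypothetical and depend on the architecture and config):

	 18:      2054         0  GIC  27  Level  eth0

Here 18 is the dynamically allocated Linux irq number while 27 is the hardware interrupt number printed by the new hunk.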
Signed-off-by: Grant Likely Tested-by: Olof Johansson Cc: Ben Herrenschmidt Cc: Thomas Gleixner --- kernel/irq/proc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 19ed5c425c3b..36f6ee181b0c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v) } else { seq_printf(p, " %8s", "None"); } + if (desc->irq_data.domain) + seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); #endif -- cgit v1.2.3 From 798f0fd188be3656991c8745104b5ee045769a5f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 6 Jun 2013 19:20:27 +0800 Subject: irq: fix checkpatch error ERROR: space required before the open parenthesis '(' WARNING: Prefer pr_warn(... to pr_warning(... Just fix the above 2 issues. Signed-off-by: Kefeng Wang Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 836a0f7ec2a9..13f265430c25 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -396,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, if (domain == NULL) domain = irq_default_domain; if (domain == NULL) { - pr_warning("irq_create_mapping called for" - " NULL domain, hwirq=%lx\n", hwirq); - WARN_ON(1); + WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); return 0; } pr_debug("-> using domain @%p\n", domain); @@ -489,8 +487,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller, if (intsize > 0) return intspec[0]; #endif - pr_warning("no irq domain found for %s !\n", - of_node_full_name(controller)); + pr_warn("no irq domain found for %s !\n", + of_node_full_name(controller)); return 0; } -- cgit v1.2.3 From 70e5975d3a04be5479a28eec4a2fb10f98ad2785 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 13 Jun 2013 11:39:50 -0700 Subject: clockevents: Prefer CPU local devices over global devices On an SMP system with only one global clockevent and a dummy clockevent per CPU we run into problems. We want the dummy clockevents to be registered as the per CPU tick devices, but we can only achieve that if we register the dummy clockevents before the global clockevent or if we artificially inflate the rating of the dummy clockevents to be higher than the rating of the global clockevent. Failure to do so leads to boot hangs when the dummy timers are registered on all other CPUs besides the CPU that accepted the global clockevent as its tick device and there is no broadcast timer to poke the dummy devices. If we're registering multiple clockevents and one clockevent is global and the other is local to a particular CPU we should choose to use the local clockevent regardless of the rating of the device. This way, if the clockevent is a dummy it will take the tick device duty as long as there isn't a higher rated tick device and any global clockevent will be bumped out into broadcast mode, fixing the problem described above.
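A worked sketch of the new preference rule (an editor's illustration; the expression is the final test installed in tick_check_preferred(), while the devices and ratings are hypothetical):

	/* curdev: global timer, rating 300, cpumask 0-3       */
	/* newdev: per-CPU dummy, rating 100, cpumask just CPU1 */
	bool prefer_new = !curdev ||
			  newdev->rating > curdev->rating ||
			  !cpumask_equal(curdev->cpumask, newdev->cpumask);

Here prefer_new is true even though the dummy's rating is lower: the cpumasks differ, so the CPU-local device takes tick duty and the global device can be pushed out to broadcast mode.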
Reported-and-tested-by: Mark Rutland Signed-off-by: Stephen Boyd Tested-by: soren.brinkmann@xilinx.com Cc: John Stultz Cc: Daniel Lezcano Cc: linux-arm-kernel@lists.infradead.org Cc: John Stultz Link: http://lkml.kernel.org/r/20130613183950.GA32061@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5edfb4806032..edd45f64162f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -243,8 +243,13 @@ static bool tick_check_preferred(struct clock_event_device *curdev, return false; } - /* Use the higher rated one */ - return !curdev || newdev->rating > curdev->rating; + /* + * Use the higher rated one, but prefer a CPU local device with a lower + * rating than a non-CPU local device + */ + return !curdev || + newdev->rating > curdev->rating || + !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* -- cgit v1.2.3 From d0667186eb0eab78dcca9f75af6ed03873ca8d9f Mon Sep 17 00:00:00 2001 From: JunweiZhang Date: Wed, 26 Jun 2013 16:40:05 +0800 Subject: kernel: remove unnecessary header file ip_vs.h is not necessary for sysctl_binary.c. Prepare for the next patch to avoid a compile issue. Signed-off-by: JunweiZhang Signed-off-by: Nicolas Dichtel Reviewed-by: Julian Anastasov Signed-off-by: Simon Horman --- kernel/sysctl_binary.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index aea4a9ea6fc8..b609213ca9a2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -3,7 +3,6 @@ #include "../fs/xfs/xfs_sysctl.h" #include #include -#include #include #include #include -- cgit v1.2.3 From 9e04d3804d3ac97d8c03a41d78d0f0674b5d01e1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 21 May 2013 20:43:50 +0200 Subject: timer: Fix jiffies wrap behavior of round_jiffies_common() A direct compare of jiffies-related values does not work in the wrap-around case. Replace it with time_is_after_jiffies(). Signed-off-by: Bart Van Assche Cc: Arjan van de Ven Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/519BC066.5080600@acm.org Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 15ffdb3f1948..15bc1b41021d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, /* now that we have rounded, subtract the extra skew again */ j -= cpu * 3; - if (j <= jiffies) /* rounding ate our timeout entirely; */ - return original; - return j; + /* + * Make sure j is still in the future. Otherwise return the + * unmodified value. + */ + return time_is_after_jiffies(j) ? j : original; } /** -- cgit v1.2.3 From 7c4c3a0f18ba57ea2a2985034532303d2929902a Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 27 Jun 2013 11:35:44 +0100 Subject: hrtimers: Support resuming with two or more CPUs online (but stopped) hrtimers_resume() only reprograms the timers for the current CPU, as it assumes that all other CPUs are offline at this point in the resume process. If other CPUs are online then their timers will not be corrected and they may fire at the wrong time. When running as a Xen guest, this assumption is not true. Non-boot CPUs are only stopped with IRQs disabled instead of being offlined.
This is a performance optimization as disabling the CPUs would add an unacceptable amount of additional downtime during a live migration (> 200 ms for a 4 VCPU guest). hrtimers_resume() cannot call on_each_cpu(retrigger_next_event,...) as the other CPUs will be stopped with IRQs disabled. Instead, defer the call to the next softirq. [ tglx: Separated the xen change out ] Signed-off-by: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: John Stultz Cc: Link: http://lkml.kernel.org/r/1372329348-20841-2-git-send-email-david.vrabel@citrix.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fd4b13b131f8..e86827e94c9a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -773,15 +773,24 @@ void clock_was_set(void) /* * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): + * interrupt on all online CPUs. However, all other CPUs will be + * stopped with IRQs interrupts disabled so the clock_was_set() call + * must be deferred to the softirq. + * + * The one-shot timer has already been programmed to fire immediately + * (see tick_resume_oneshot()) and this interrupt will trigger the + * softirq to run early enough to correctly reprogram the timers on + * all CPUs. */ void hrtimers_resume(void) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + WARN_ONCE(!irqs_disabled(), KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - retrigger_next_event(NULL); - timerfd_clock_was_set(); + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); } static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -- cgit v1.2.3 From 04397fe94ad65289884b9862b6a0c722ececaadf Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 27 Jun 2013 11:35:45 +0100 Subject: timekeeping: Pass flags instead of multiple bools to timekeeping_update() Instead of passing multiple bools to timekeeping_update(), define flags and use a single 'action' parameter. It is then more obvious what each timekeeping_update() call does.
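A before/after sketch of a call site (an editor's illustration; the TK_* values are the ones the patch defines):

	#define TK_CLEAR_NTP	(1 << 0)
	#define TK_MIRROR	(1 << 1)

	/* before: intent hidden in positional booleans */
	timekeeping_update(tk, true, true);

	/* after: the action is readable at the call site */
	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR);

Because the flags are independent bits, later patches can add further actions (the following commit adds TK_CLOCK_WAS_SET) without touching every existing caller.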
Signed-off-by: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: John Stultz Cc: Link: http://lkml.kernel.org/r/1372329348-20841-3-git-send-email-david.vrabel@citrix.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 838fc0777b68..d8b23a929e66 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -27,6 +27,9 @@ #include "ntp_internal.h" #include "timekeeping_internal.h" +#define TK_CLEAR_NTP (1 << 0) +#define TK_MIRROR (1 << 1) + static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static seqcount_t timekeeper_seq; @@ -242,16 +245,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) +static void timekeeping_update(struct timekeeper *tk, unsigned int action) { - if (clearntp) { + if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); } update_vsyscall(tk); update_pvclock_gtod(tk); - if (mirror) + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } @@ -509,7 +512,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -553,7 +556,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -643,7 +646,7 @@ static int change_clocksource(void *data) module_put(new->owner); } } - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -884,7 +887,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -966,7 +969,7 @@ static void timekeeping_resume(void) tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false, true); + timekeeping_update(tk, TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1419,7 +1422,7 @@ static void update_wall_time(void) * updating. */ memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, false, false); + timekeeping_update(real_tk, 0); write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- cgit v1.2.3 From 780427f0e113b4c77dfff4d258c05a902cdb0eb9 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 27 Jun 2013 11:35:46 +0100 Subject: timekeeping: Indicate that clock was set in the pvclock gtod notifier If the clock was set (stepped), set the action parameter to functions in the pvclock gtod notifier chain to non-zero. This allows the callee to only do work if the clock was stepped. 
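A sketch of how a callee on the chain can use the new argument; the callback name and body are hypothetical, only the notifier signature and the meaning of 'was_set' come from this patch:

	static int pvclock_gtod_notify_sketch(struct notifier_block *nb,
					      unsigned long was_set, void *priv)
	{
		/* priv is the timekeeper passed by raw_notifier_call_chain() */
		if (!was_set)
			return NOTIFY_OK;	/* periodic tick update: nothing to do */

		/* The clock was stepped: resync dependent clocks here. */
		return NOTIFY_OK;
	}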
This will be used on Xen as the synchronization of the Xen wallclock to the control domain's (dom0) system time will be done with this notifier and updating on every timer tick is unnecessary and too expensive. Signed-off-by: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: John Stultz Cc: Link: http://lkml.kernel.org/r/1372329348-20841-4-git-send-email-david.vrabel@citrix.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d8b23a929e66..846d0a1f235e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -29,6 +29,7 @@ #define TK_CLEAR_NTP (1 << 0) #define TK_MIRROR (1 << 1) +#define TK_CLOCK_WAS_SET (1 << 2) static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -204,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); -static void update_pvclock_gtod(struct timekeeper *tk) +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { - raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); + raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** @@ -220,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - update_pvclock_gtod(tk); + update_pvclock_gtod(tk, true); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; @@ -252,7 +253,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) ntp_clear(); } update_vsyscall(tk); - update_pvclock_gtod(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); @@ -512,7 +513,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -556,7 +557,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -646,7 +647,7 @@ static int change_clocksource(void *data) module_put(new->owner); } } - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -887,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -969,7 +970,7 @@ static void timekeeping_resume(void) tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, TK_MIRROR); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); 
raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1243,9 +1244,10 @@ out_adjust: * It also calls into the NTP code to handle leapsecond processing. * */ -static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + unsigned int action = 0; while (tk->xtime_nsec >= nsecps) { int leap; @@ -1268,8 +1270,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_was_set_delayed(); + action = TK_CLOCK_WAS_SET; } } + return action; } /** @@ -1354,6 +1358,7 @@ static void update_wall_time(void) struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; + unsigned int action; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1406,7 +1411,7 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - accumulate_nsecs_to_secs(tk); + action = accumulate_nsecs_to_secs(tk); write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ @@ -1422,7 +1427,7 @@ static void update_wall_time(void) * updating. */ memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, 0); + timekeeping_update(real_tk, action); write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1684,6 +1689,7 @@ int do_adjtimex(struct timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); + update_pvclock_gtod(tk, true); clock_was_set_delayed(); } write_seqcount_end(&timekeeper_seq); -- cgit v1.2.3 From 6e94a780374ed31b280f939d4757e8d7858dff16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jun 2013 10:58:31 -0400 Subject: tracing: Failed to create system directory Running the following: # cd /sys/kernel/debug/tracing # echo p:i do_sys_open > kprobe_events # echo p:j schedule >> kprobe_events # cat kprobe_events p:kprobes/i do_sys_open p:kprobes/j schedule # echo p:i do_sys_open >> kprobe_events # cat kprobe_events p:kprobes/j schedule p:kprobes/i do_sys_open # ls /sys/kernel/debug/tracing/events/kprobes/ enable filter j Notice that the 'i' is missing from the kprobes directory. The console produces: "Failed to create system directory kprobes" This is because kprobes passes in an allocated name for the system and the ftrace event subsystem saves off that name instead of creating a duplicate for it. But kprobes may free the system name, making the pointer to it invalid. This bug was introduced by 92edca073c37 "tracing: Use direct field, type and system names" which switched from using kstrdup() on the system name to just keeping a pointer to it, as the internal ftrace event system names are static and exist for the life of the computer being booted. Instead of reverting back to duplicating system names again, we can use core_kernel_data() to determine if the passed in name was allocated or static. Then use the MSB of the ref_count as a flag to track whether the name was allocated or not. That way we still avoid duplicating strings that will always exist, while copying the ones that may be freed.
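The lifetime bug in miniature, with hypothetical names; the ftrace code kept a borrowed pointer whose ownership was ambiguous:

	static const char *saved_name;

	static void save_system_name(const char *name)
	{
		saved_name = name;	/* safe only for names in the kernel image */
	}

	static void kprobes_like_caller(void)
	{
		char *name = kstrdup("kprobes", GFP_KERNEL);

		save_system_name(name);
		kfree(name);		/* saved_name now dangles */
	}

core_kernel_data() tells the two cases apart at runtime, and the MSB of ref_count records which one applies, as the diff below shows.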
Cc: stable@vger.kernel.org # 3.10 Reported-by: "zhangwei(Jovi)" Reported-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f57b01574a30..903a0bf2685e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields); static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; +#define SYSTEM_FL_FREE_NAME (1 << 31) + +static inline int system_refcount(struct event_subsystem *system) +{ + return system->ref_count & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_inc(struct event_subsystem *system) +{ + return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_dec(struct event_subsystem *system) +{ + return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; +} + /* Double loops, do not use break, only goto's work */ #define do_for_each_event_file(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ @@ -344,8 +361,8 @@ static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; - WARN_ON_ONCE(system->ref_count == 0); - if (--system->ref_count) + WARN_ON_ONCE(system_refcount(system) == 0); + if (system_refcount_dec(system)) return; list_del(&system->list); @@ -354,13 +371,15 @@ static void __put_system(struct event_subsystem *system) kfree(filter->filter_string); kfree(filter); } + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); kfree(system); } static void __get_system(struct event_subsystem *system) { - WARN_ON_ONCE(system->ref_count == 0); - system->ref_count++; + WARN_ON_ONCE(system_refcount(system) == 0); + system_refcount_inc(system); } static void __get_system_dir(struct ftrace_subsystem_dir *dir) @@ -374,7 +393,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ - WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); __put_system(dir->subsystem); if (!--dir->ref_count) @@ -1274,7 +1293,15 @@ create_new_subsystem(const char *name) return NULL; system->ref_count = 1; - system->name = name; + + /* Only allocate if dynamic (kprobes and modules) */ + if (!core_kernel_data((unsigned long)name)) { + system->ref_count |= SYSTEM_FL_FREE_NAME; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) + goto out_free; + } else + system->name = name; system->filter = NULL; @@ -1287,6 +1314,8 @@ create_new_subsystem(const char *name) return system; out_free: + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); kfree(system); return NULL; } -- cgit v1.2.3 From 288e984e622336bab8bc3dfdf2f190816362d9a1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 19:38:06 +0200 Subject: tracing/kprobes: Avoid perf_trace_buf_*() if ->perf_events is empty perf_trace_buf_prepare() + perf_trace_buf_submit() make no sense if this task/CPU has no active counters. Change kprobe_perf_func() and kretprobe_perf_func() to check call->perf_events beforehand and return if this list is empty. For example, "perf record -e some_probe -p1". 
Only /sbin/init will report, all other threads which hit the same probe will do perf_trace_buf_prepare/perf_trace_buf_submit just to realize that nobody wants perf_swevent_event(). Link: http://lkml.kernel.org/r/20130620173806.GA13151@redhat.com Acked-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f2374172ba7b..c35bebe53ffe 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1156,6 +1156,10 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) int size, __size, dsize; int rctx; + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + dsize = __get_data_size(tp, regs); __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); @@ -1171,8 +1175,6 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) entry->ip = (unsigned long)tp->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); } @@ -1188,6 +1190,10 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, int size, __size, dsize; int rctx; + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + dsize = __get_data_size(tp, regs); __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); @@ -1203,8 +1209,6 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head, NULL); } -- cgit v1.2.3 From 3fe3d6193e7cd7b4dd2bde10772f048bdefea4ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 19:38:09 +0200 Subject: tracing/kprobes: Kill probe_enable_lock enable_trace_probe() and disable_trace_probe() should not worry about serialization, the caller (perf_trace_init or __ftrace_set_clr_event) holds event_mutex. They are also called by kprobe_trace_self_tests_init(), but this __init function can't race with itself or trace_events.c And note that this code depended on event_mutex even before 41a7dd420c which introduced probe_enable_lock. In fact it assumes that the caller kprobe_register() can never race with itself. Otherwise, say, tp->flags manipulations are racy. Link: http://lkml.kernel.org/r/20130620173809.GA13158@redhat.com Acked-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c35bebe53ffe..282f86cfd304 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -183,16 +183,15 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } +/* + * This and enable_trace_probe/disable_trace_probe rely on event_mutex + * held by the caller, __ftrace_set_clr_event(). 
+ */ static int trace_probe_nr_files(struct trace_probe *tp) { - struct ftrace_event_file **file; + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); int ret = 0; - /* - * Since all tp->files updater is protected by probe_enable_lock, - * we don't need to lock an rcu_read_lock. - */ - file = rcu_dereference_raw(tp->files); if (file) while (*(file++)) ret++; @@ -200,8 +199,6 @@ static int trace_probe_nr_files(struct trace_probe *tp) return ret; } -static DEFINE_MUTEX(probe_enable_lock); - /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. @@ -211,8 +208,6 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { int ret = 0; - mutex_lock(&probe_enable_lock); - if (file) { struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); @@ -223,7 +218,7 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) GFP_KERNEL); if (!new) { ret = -ENOMEM; - goto out_unlock; + goto out; } memcpy(new, old, n * sizeof(struct ftrace_event_file *)); new[n] = file; @@ -246,10 +241,7 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) else ret = enable_kprobe(&tp->rp.kp); } - - out_unlock: - mutex_unlock(&probe_enable_lock); - + out: return ret; } @@ -282,8 +274,6 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { int ret = 0; - mutex_lock(&probe_enable_lock); - if (file) { struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); @@ -292,7 +282,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) old = rcu_dereference_raw(tp->files); if (n == 0 || trace_probe_file_index(tp, file) < 0) { ret = -EINVAL; - goto out_unlock; + goto out; } if (n == 1) { /* Remove the last file */ @@ -303,7 +293,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) GFP_KERNEL); if (!new) { ret = -ENOMEM; - goto out_unlock; + goto out; } /* This copy & check loop copies the NULL stopper too */ @@ -326,10 +316,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) else disable_kprobe(&tp->rp.kp); } - - out_unlock: - mutex_unlock(&probe_enable_lock); - + out: return ret; } @@ -1214,6 +1201,12 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, } #endif /* CONFIG_PERF_EVENTS */ +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + * + * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe + * lockless, but we can't race with this __init function. + */ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) @@ -1379,6 +1372,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) return NULL; } +/* + * Nobody but us can call enable_trace_probe/disable_trace_probe at this + * stage, we can do this lockless. + */ static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; -- cgit v1.2.3 From a439059610ecd257dba29a612729132e470d118f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 29 Jun 2013 00:08:04 -0500 Subject: tracing: Simplify code for showing of soft disabled flag Rather than enumerating each permutation, build the enable state string up from the combination of states. This also allows for the simpler addition of more states. 
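The rewritten read handler below builds the string in a four byte stack buffer; enumerating the possible results (a sanity check, not part of the patch) shows they all fit:

	/*
	 *   "0\n"   - disabled
	 *   "0*\n"  - enabled, but soft disabled
	 *   "1\n"   - enabled
	 *   "1*\n"  - enabled, in soft mode
	 *
	 * The longest result is three characters plus the NUL, so buf[4]
	 * is exactly big enough.
	 */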
Link: http://lkml.kernel.org/r/9aff5af6dee2f5a40ca30df41c39d5f33e998d7a.1372479499.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 903a0bf2685e..7ee08b95c384 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -638,17 +638,17 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_file *file = filp->private_data; - char *buf; + char buf[4] = "0"; - if (file->flags & FTRACE_EVENT_FL_ENABLED) { - if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) - buf = "0*\n"; - else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) - buf = "1*\n"; - else - buf = "1\n"; - } else - buf = "0\n"; + if (file->flags & FTRACE_EVENT_FL_ENABLED && + !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + strcpy(buf, "1"); + + if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED || + file->flags & FTRACE_EVENT_FL_SOFT_MODE) + strcat(buf, "*"); + + strcat(buf, "\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } -- cgit v1.2.3 From 3baa5e4cf224b8a55220cc841bb475e164b84ceb Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 29 Jun 2013 00:08:07 -0500 Subject: tracing: Fix disabling of soft disable The comment on the soft disable 'disable' case of __ftrace_event_enable_disable() states that the soft disable bit should be cleared in that case, but currently only the soft mode bit is actually cleared. This essentially leaves the standard non-soft-enable enable/disable paths as the only way to clear the soft disable flag, but the soft disable bit should also be cleared when removing a trigger with '!'. Also, the SOFT_DISABLED bit should never be set if SOFT_MODE is cleared. This fixes the above discrepancies. Link: http://lkml.kernel.org/r/b9c68dd50bc07019e6c67d3f9b29be4ef1b2badb.1372479499.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7ee08b95c384..5892470bc2ee 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -291,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, } call->class->reg(call, TRACE_REG_UNREGISTER, file); } - /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* -- cgit v1.2.3 From b04d52e368e2cf526abb2bab61f304eaea126af2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 19:38:14 +0200 Subject: tracing/kprobes: Turn trace_probe->files into list_head I think that "ftrace_event_file *trace_probe[]" complicates the code for no reason, turn it into list_head to simplify the code. enable_trace_probe() no longer needs synchronize_sched(). This needs the extra sizeof(list_head) memory for every attached ftrace_event_file, hopefully not a problem in this case. 
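The RCU discipline the list conversion relies on, condensed from the three sites in the diff below (a sketch; allocation and error handling dropped):

	/* writer, attach (enable_trace_probe): */
	link->file = file;
	list_add_tail_rcu(&link->list, &tp->files);

	/* writer, detach (disable_trace_probe): */
	list_del_rcu(&link->list);
	synchronize_sched();	/* wait for in-flight probe handlers */
	kfree(link);

	/* lockless reader (probe handlers run with preemption disabled): */
	list_for_each_entry_rcu(link, &tp->files, list)
		__kprobe_trace_func(tp, regs, link->file);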
Link: http://lkml.kernel.org/r/20130620173814.GA13165@redhat.com Acked-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 138 ++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 282f86cfd304..405b5b0f903e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,12 +35,17 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; - struct ftrace_event_file * __rcu *files; + struct list_head files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + #define SIZEOF_TRACE_PROBE(n) \ (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) @@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; INIT_LIST_HEAD(&tp->list); + INIT_LIST_HEAD(&tp->files); return tp; error: kfree(tp->call.name); @@ -183,22 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } -/* - * This and enable_trace_probe/disable_trace_probe rely on event_mutex - * held by the caller, __ftrace_set_clr_event(). - */ -static int trace_probe_nr_files(struct trace_probe *tp) -{ - struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - int ret = 0; - - if (file) - while (*(file++)) - ret++; - - return ret; -} - /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. @@ -209,29 +199,18 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) int ret = 0; if (file) { - struct ftrace_event_file **new, **old; - int n = trace_probe_nr_files(tp); - - old = rcu_dereference_raw(tp->files); - /* 1 is for new one and 1 is for stopper */ - new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), - GFP_KERNEL); - if (!new) { + struct event_file_link *link; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) { ret = -ENOMEM; goto out; } - memcpy(new, old, n * sizeof(struct ftrace_event_file *)); - new[n] = file; - /* The last one keeps a NULL */ - rcu_assign_pointer(tp->files, new); - tp->flags |= TP_FLAG_TRACE; + link->file = file; + list_add_tail_rcu(&link->list, &tp->files); - if (old) { - /* Make sure the probe is done with old files */ - synchronize_sched(); - kfree(old); - } + tp->flags |= TP_FLAG_TRACE; } else tp->flags |= TP_FLAG_PROFILE; @@ -245,24 +224,16 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) return ret; } -static int -trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) +static struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) { - struct ftrace_event_file **files; - int i; + struct event_file_link *link; - /* - * Since all tp->files updater is protected by probe_enable_lock, - * we don't need to lock an rcu_read_lock. 
- */ - files = rcu_dereference_raw(tp->files); - if (files) { - for (i = 0; files[i]; i++) - if (files[i] == file) - return i; - } + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; - return -1; + return NULL; } /* @@ -275,38 +246,23 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) int ret = 0; if (file) { - struct ftrace_event_file **new, **old; - int n = trace_probe_nr_files(tp); - int i, j; + struct event_file_link *link; - old = rcu_dereference_raw(tp->files); - if (n == 0 || trace_probe_file_index(tp, file) < 0) { + link = find_event_file_link(tp, file); + if (!link) { ret = -EINVAL; goto out; } - if (n == 1) { /* Remove the last file */ - tp->flags &= ~TP_FLAG_TRACE; - new = NULL; - } else { - new = kzalloc(n * sizeof(struct ftrace_event_file *), - GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto out; - } - - /* This copy & check loop copies the NULL stopper too */ - for (i = 0, j = 0; j < n && i < n + 1; i++) - if (old[i] != file) - new[j++] = old[i]; - } + list_del_rcu(&link->list); + /* synchronize with kprobe_trace_func/kretprobe_trace_func */ + synchronize_sched(); + kfree(link); - rcu_assign_pointer(tp->files, new); + if (!list_empty(&tp->files)) + goto out; - /* Make sure the probe is done with old files */ - synchronize_sched(); - kfree(old); + tp->flags &= ~TP_FLAG_TRACE; } else tp->flags &= ~TP_FLAG_PROFILE; @@ -871,20 +827,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, static __kprobes void kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) { - /* - * Note: preempt is already disabled around the kprobe handler. - * However, we still need an smp_read_barrier_depends() corresponding - * to smp_wmb() in rcu_assign_pointer() to access the pointer. - */ - struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - - if (unlikely(!file)) - return; + struct event_file_link *link; - while (*file) { - __kprobe_trace_func(tp, regs, *file); - file++; - } + list_for_each_entry_rcu(link, &tp->files, list) + __kprobe_trace_func(tp, regs, link->file); } /* Kretprobe handler */ @@ -931,20 +877,10 @@ static __kprobes void kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct pt_regs *regs) { - /* - * Note: preempt is already disabled around the kprobe handler. - * However, we still need an smp_read_barrier_depends() corresponding - * to smp_wmb() in rcu_assign_pointer() to access the pointer. - */ - struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - - if (unlikely(!file)) - return; + struct event_file_link *link; - while (*file) { - __kretprobe_trace_func(tp, ri, regs, *file); - file++; - } + list_for_each_entry_rcu(link, &tp->files, list) + __kretprobe_trace_func(tp, ri, regs, link->file); } /* Event entry printers */ -- cgit v1.2.3 From 10246fa35d4ffdfe472185d4cbf9c2dfd9a9f023 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 1 Jul 2013 15:58:24 -0400 Subject: tracing: Use flag buffer_disabled for irqsoff tracer If the ring buffer is disabled and the irqsoff tracer records a trace, it will clear out its buffer and lose the data it had previously recorded. Currently there's a callback when writing to the tracing_on file, but if tracing is disabled via the function tracer trigger, it will not inform the irqsoff tracer to stop recording.
By using the "mirror" flag (buffer_disabled) in the trace_array, that keeps track of the status of the trace_array's buffer, it gives the irqsoff tracer a fast way to know if it should record a new trace or not. The flag may be a little behind the real state of the buffer, but it should not affect the trace too much. It's more important for the irqsoff tracer to be fast. Reported-by: Dave Jones Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 101 ++++++++++++++++++++++++++++++------------- kernel/trace/trace_irqsoff.c | 4 +- 2 files changed, 72 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c4c9296b1916..0dc50711d656 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -226,9 +226,24 @@ cycle_t ftrace_now(int cpu) return ts; } +/** + * tracing_is_enabled - Show if global_trace has been disabled + * + * Shows if the global trace has been enabled or not. It uses the + * mirror flag "buffer_disabled" to be used in fast paths such as for + * the irqsoff tracer. But it may be inaccurate due to races. If you + * need to know the accurate state, use tracing_is_on() which is a little + * slower, but accurate. + */ int tracing_is_enabled(void) { - return tracing_is_on(); + /* + * For quick access (irqsoff uses this in fast path), just + * return the mirror variable of the state of the ring buffer. + * It's a little racy, but we don't really care. + */ + smp_rmb(); + return !global_trace.buffer_disabled; } /* @@ -341,6 +356,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; +void tracer_tracing_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_on(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 0; + /* Make the flag seen by readers */ + smp_wmb(); +} + /** * tracing_on - enable tracing buffers * @@ -349,15 +381,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | */ void tracing_on(void) { - if (global_trace.trace_buffer.buffer) - ring_buffer_record_on(global_trace.trace_buffer.buffer); - /* - * This flag is only looked at when buffers haven't been - * allocated yet. We don't really care about the race - * between setting this flag and actually turning - * on the buffer. - */ - global_trace.buffer_disabled = 0; + tracer_tracing_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_on); @@ -551,6 +575,23 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ +void tracer_tracing_off(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_off(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. 
+ */ + tr->buffer_disabled = 1; + /* Make the flag seen by readers */ + smp_wmb(); +} + /** * tracing_off - turn off tracing buffers * @@ -561,15 +602,7 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); */ void tracing_off(void) { - if (global_trace.trace_buffer.buffer) - ring_buffer_record_off(global_trace.trace_buffer.buffer); - /* - * This flag is only looked at when buffers haven't been - * allocated yet. We don't really care about the race - * between setting this flag and actually turning - * on the buffer. - */ - global_trace.buffer_disabled = 1; + tracer_tracing_off(&global_trace); } EXPORT_SYMBOL_GPL(tracing_off); @@ -579,14 +612,25 @@ void disable_trace_on_warning(void) tracing_off(); } +/** + * tracer_tracing_is_on - show real state of ring buffer enabled + * @tr : the trace array to know if ring buffer is enabled + * + * Shows real state of the ring buffer if it is enabled or not. + */ +int tracer_tracing_is_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + return ring_buffer_record_is_on(tr->trace_buffer.buffer); + return !tr->buffer_disabled; +} + /** * tracing_is_on - show state of ring buffers enabled */ int tracing_is_on(void) { - if (global_trace.trace_buffer.buffer) - return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); - return !global_trace.buffer_disabled; + return tracer_tracing_is_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_is_on); @@ -3958,7 +4002,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_enabled() && iter->pos) + if (!tracing_is_on() && iter->pos) break; } @@ -5631,15 +5675,10 @@ rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->trace_buffer.buffer; char buf[64]; int r; - if (buffer) - r = ring_buffer_record_is_on(buffer); - else - r = 0; - + r = tracer_tracing_is_on(tr); r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -5661,11 +5700,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (buffer) { mutex_lock(&trace_types_lock); if (val) { - ring_buffer_record_on(buffer); + tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); } else { - ring_buffer_record_off(buffer); + tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b19d065a28cb..2aefbee93a6d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + if (!tracer_enabled || !tracing_is_enabled()) return; cpu = raw_smp_processor_id(); @@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) else return; - if (!tracer_enabled) + if (!tracer_enabled || !tracing_is_enabled()) return; data = per_cpu_ptr(tr->trace_buffer.data, cpu); -- cgit v1.2.3 From cf6735a4b103b801753748531e3658cdc8cafa5e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 19:38:11 +0200 Subject: tracing/kprobes: Don't pass addr=ip to perf_trace_buf_submit() kprobe_perf_func() and kretprobe_perf_func() pass addr=ip to perf_trace_buf_submit() for no reason. This sets perf_sample_data->addr for PERF_SAMPLE_ADDR, we already have perf_sample_data->ip initialized if PERF_SAMPLE_IP. 
Link: http://lkml.kernel.org/r/20130620173811.GA13161@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 405b5b0f903e..7ed6976493c8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1098,8 +1098,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) entry->ip = (unsigned long)tp->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, - entry->ip, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1132,8 +1131,7 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, - entry->ret_ip, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From f1ed7c741fcd0c3d7d318e7c19813d89934b9296 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 27 Jun 2013 22:18:06 -0400 Subject: ftrace: Do not run selftest if command line parameter is set If the kernel command line ftrace filter parameters are set (ftrace_filter or ftrace_notrace), force the function self test to pass, with a warning why it was forced. If the user adds a filter to the kernel command line, it is assumed that they know what they are doing, and the self test should just not run instead of failing (which disables function tracing) or clearing the filter, as that will probably annoy the user. If the user wants the selftest to run, the message will tell them why it did not. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 +++++ kernel/trace/trace.h | 1 + kernel/trace/trace_selftest.c | 18 ++++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26e19105cdcc..67708f46baae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3537,8 +3537,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; + static int __init set_ftrace_notrace(char *str) { + ftrace_filter_param = true; strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } @@ -3546,6 +3550,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { + ftrace_filter_param = true; strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 711ca7d3e7f1..a88939e666b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -776,6 +776,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct task_struct *task) { if (list_empty(&ftrace_pids)) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 2901e3b88590..a7329b7902f8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -640,13 +640,20 @@ out: * Enable ftrace, sleep 1/10 second, and then read the trace * buffer to see if all is in order. */ -int +__init int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { int save_ftrace_enabled = ftrace_enabled; unsigned long count; int ret; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + /* make sure msleep has been recorded */ msleep(1); @@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) * Pretty much the same than for the function tracer from which the selftest * has been borrowed. */ -int +__init int trace_selftest_startup_function_graph(struct tracer *trace, struct trace_array *tr) { int ret; unsigned long count; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + /* * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs -- cgit v1.2.3 From 2d71619c59fac95a5415a326162fa046161b938c Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Mon, 1 Jul 2013 15:31:24 -0700 Subject: tracing: Make trace_marker use the correct per-instance buffer The trace_marker file was present for each new instance created, but it added the trace mark to the global trace buffer instead of to the instance's buffer. 
Link: http://lkml.kernel.org/r/1372717885-4543-2-git-send-email-azl@google.com Cc: David Sharp Cc: Vaibhav Nagarnaik Cc: Alexander Z Lam Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0dc50711d656..e04e7119633d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4391,6 +4391,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { unsigned long addr = (unsigned long)ubuf; + struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; struct ring_buffer *buffer; struct print_entry *entry; @@ -4450,7 +4451,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.trace_buffer.buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (!event) { -- cgit v1.2.3 From a82274151af2b075163e3c42c828529dee311487 Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Mon, 1 Jul 2013 19:37:54 -0700 Subject: tracing: Protect ftrace_trace_arrays list in trace_events.c There are multiple places where the ftrace_trace_arrays list is accessed in trace_events.c without the trace_types_lock held. Link: http://lkml.kernel.org/r/1372732674-22726-1-git-send-email-azl@google.com Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e04e7119633d..e36da7ff59bf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -266,7 +266,7 @@ static struct tracer *trace_types __read_mostly; /* * trace_types_lock is used to protect the trace_types list. */ -static DEFINE_MUTEX(trace_types_lock); +DEFINE_MUTEX(trace_types_lock); /* * serialize the access of the ring buffer diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a88939e666b7..2c3cba59552d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -224,6 +224,8 @@ enum { extern struct list_head ftrace_trace_arrays; +extern struct mutex trace_types_lock; + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5892470bc2ee..35c6f23c71b2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1008,6 +1008,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) int ret; /* Make sure the system still exists */ + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); list_for_each_entry(tr, &ftrace_trace_arrays, list) { list_for_each_entry(dir, &tr->systems, list) { @@ -1023,6 +1024,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) } exit_loop: mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); if (!system) return -ENODEV; @@ -1617,6 +1619,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call, int trace_add_event_call(struct ftrace_event_call *call) { int ret; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); ret = __register_event(call, NULL); @@ -1624,11 +1627,13 @@ int trace_add_event_call(struct ftrace_event_call *call) __add_event_to_tracers(call, NULL); mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); return ret; } /* - * Must be called under locking both of event_mutex and trace_event_sem. + * Must be called under locking of trace_types_lock, event_mutex and + * trace_event_sem. */ static void __trace_remove_event_call(struct ftrace_event_call *call) { @@ -1640,11 +1645,13 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) /* Remove an event_call */ void trace_remove_event_call(struct ftrace_event_call *call) { + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); down_write(&trace_event_sem); __trace_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); } #define for_each_event(event, start, end) \ @@ -1788,6 +1795,7 @@ static int trace_module_notify(struct notifier_block *self, { struct module *mod = data; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); switch (val) { case MODULE_STATE_COMING: @@ -1798,6 +1806,7 @@ static int trace_module_notify(struct notifier_block *self, break; } mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); return 0; } -- cgit v1.2.3 From 4f6de4d51f4a3ab06a85e91e708cc89a513ef30c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Tue, 2 Jul 2013 15:35:11 +0930 Subject: module: don't modify argument of module_kallsyms_lookup_name() If we pass a pointer to a const string in the form "module:symbol" module_kallsyms_lookup_name() will try to split the string at the colon, i.e., will try to modify r/o data. That will, in fact, fail on a kernel with enabled CONFIG_DEBUG_RODATA. Avoid modifying the passed string in module_kallsyms_lookup_name(), modify find_module_all() instead to pass it the module name length. Signed-off-by: Mathias Krause Signed-off-by: Rusty Russell --- kernel/module.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b049939177f6..a1951aba7a03 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -455,7 +455,7 @@ const struct kernel_symbol *find_symbol(const char *name, EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. 
*/ -static struct module *find_module_all(const char *name, +static struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; @@ -463,7 +463,7 @@ static struct module *find_module_all(const char *name, list_for_each_entry(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; - if (strcmp(mod->name, name) == 0) + if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) return mod; } return NULL; @@ -471,7 +471,7 @@ static struct module *find_module_all(const char *name, struct module *find_module(const char *name) { - return find_module_all(name, false); + return find_module_all(name, strlen(name), false); } EXPORT_SYMBOL_GPL(find_module); @@ -3027,7 +3027,7 @@ static bool finished_loading(const char *name) bool ret; mutex_lock(&module_mutex); - mod = find_module_all(name, true); + mod = find_module_all(name, strlen(name), true); ret = !mod || mod->state == MODULE_STATE_LIVE || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); @@ -3165,7 +3165,8 @@ static int add_unformed_module(struct module *mod) again: mutex_lock(&module_mutex); - if ((old = find_module_all(mod->name, true)) != NULL) { + old = find_module_all(mod->name, strlen(mod->name), true); + if (old != NULL) { if (old->state == MODULE_STATE_COMING || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ @@ -3576,10 +3577,8 @@ unsigned long module_kallsyms_lookup_name(const char *name) /* Don't lock: we're in enough trouble already. */ preempt_disable(); if ((colon = strchr(name, ':')) != NULL) { - *colon = '\0'; - if ((mod = find_module(name)) != NULL) + if ((mod = find_module_all(name, colon - name, false)) != NULL) ret = mod_find_symname(mod, colon+1); - *colon = ':'; } else { list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) -- cgit v1.2.3 From b634d130e46a093ddf716ae9cf1bfa258ede36cf Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 2 Jul 2013 15:35:11 +0930 Subject: There is no /sys/parameters There is no such path as /sys/parameters, module parameters live in /sys/module/*/parameters. Signed-off-by: Jean Delvare Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 53b958fcd639..440e65d1a544 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -787,7 +787,7 @@ static void __init kernel_add_sysfs_param(const char *name, } /* - * param_sysfs_builtin - add contents in /sys/parameters for built-in modules + * param_sysfs_builtin - add sysfs parameters for built-in modules * * Add module_parameters to sysfs for "modules" built into the kernel. * -- cgit v1.2.3 From 54041d8a73337411b485ff76957fb106cb5d40d0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 2 Jul 2013 15:35:12 +0930 Subject: modules: don't fail to load on unknown parameters. Although parameters are supposed to be part of the kernel API, experimental parameters are often removed. In addition, downgrading a kernel might cause previously-working modules to fail to load. On balance, it's probably better to warn, and load the module anyway. This may let through a typo, but at least the logs will show it. 
Reported-by: Andy Lutomirski Signed-off-by: Rusty Russell --- kernel/module.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a1951aba7a03..5184877ce98a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3212,6 +3212,17 @@ out: return err; } +static int unknown_module_param_cb(char *param, char *val, const char *modname) +{ + /* Check for magic 'dyndbg' arg */ + int ret = ddebug_dyndbg_module_param_cb(param, val, modname); + if (ret != 0) { + printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", + modname, param); + } + return 0; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static int load_module(struct load_info *info, const char __user *uargs, @@ -3298,7 +3309,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, &ddebug_dyndbg_module_param_cb); + -32768, 32767, unknown_module_param_cb); if (err < 0) goto bug_cleanup; -- cgit v1.2.3 From c9b5a266b103af873abb9ac03bc3d067702c8f4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jun 2013 12:17:32 +0200 Subject: tick: Make oneshot broadcast robust vs. CPU offlining In periodic mode we remove offline cpus from the broadcast propagation mask. In oneshot mode we fail to do so. This was not a problem so far, but the recent changes to the broadcast propagation introduced a constellation which can result in a NULL pointer dereference. What happens is: CPU0 CPU1 idle() arch_idle() tick_broadcast_oneshot_control(OFF); set cpu1 in tick_broadcast_force_mask if (cpu_offline()) arch_cpu_dead() cpu_dead_cleanup(cpu1) cpu1 tickdevice pointer = NULL broadcast interrupt dereference cpu1 tickdevice pointer -> OOPS We dereference the pointer because cpu1 is still set in tick_broadcast_force_mask and tick_do_broadcast() expects a valid cpumask and therefor lacks any further checks. Remove the cpu from the tick_broadcast_force_mask before we set the tick device pointer to NULL. Also add a sanity check to the oneshot broadcast function, so we can detect such issues w/o crashing the machine. Reported-by: Prarit Bhargava Cc: athorlton@sgi.com Cc: CAI Qian Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1306261303260.4013@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index d067c01586f5..4790037163f6 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -532,6 +532,13 @@ again: cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); + /* + * Sanity check. Catch the case where we try to broadcast to + * offline cpus. + */ + if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) + cpumask_and(tmpmask, tmpmask, cpu_online_mask); + /* * Wakeup the cpus which have an expired event. */ @@ -773,10 +780,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* - * Clear the broadcast mask flag for the dead cpu, but do not - * stop the broadcast device! + * Clear the broadcast masks for the dead cpu, but do not stop + * the broadcast device! 
*/ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); + cpumask_clear_cpu(cpu, tick_broadcast_force_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3 From 1f73a9806bdd07a5106409bbcab3884078bd34fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 1 Jul 2013 22:14:10 +0200 Subject: tick: Prevent uncontrolled switch to oneshot mode When the system switches from periodic to oneshot mode, the broadcast logic causes a possibility that a CPU which has not yet switched to oneshot mode puts its own clock event device into oneshot mode without updating the state and the timer handler. CPU0 CPU1 per cpu tickdev is in periodic mode and switched to broadcast Switch to oneshot mode tick_broadcast_switch_to_oneshot() cpumask_copy(tick_oneshot_broacast_mask, tick_broadcast_mask); broadcast device mode = oneshot Timer interrupt irq_enter() tick_check_oneshot_broadcast() dev->set_mode(ONESHOT); tick_handle_periodic() if (dev->mode == ONESHOT) dev->next_event += period; FAIL. We fail, because dev->next_event contains KTIME_MAX, if the device was in periodic mode before the uncontrolled switch to oneshot happened. We must copy the broadcast bits over to the oneshot mask, because otherwise a CPU which relies on the broadcast would not been woken up anymore after the broadcast device switched to oneshot mode. So we need to verify in tick_check_oneshot_broadcast() whether the CPU has already switched to oneshot mode. If not, leave the device untouched and let the CPU switch controlled into oneshot mode. This is a long standing bug, which was never noticed, because the main user of the broadcast x86 cannot run into that scenario, AFAICT. The nonarchitected timer mess of ARM creates a gazillion of differently broken abominations which trigger the shortcomings of that broadcast code, which better had never been necessary in the first place. Reported-and-tested-by: Stehle Vincent-B46079 Reviewed-by: Stephen Boyd Cc: John Stultz , Cc: Mark Rutland Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1307012153060.4013@ionos.tec.linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 4790037163f6..248f80dba746 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -492,7 +492,15 @@ void tick_check_oneshot_broadcast(int cpu) if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + /* + * We might be in the middle of switching over from + * periodic to oneshot. If the CPU has not yet + * switched over, leave the device alone. + */ + if (td->mode == TICKDEV_MODE_ONESHOT) { + clockevents_set_mode(td->evtdev, + CLOCK_EVT_MODE_ONESHOT); + } } } -- cgit v1.2.3 From 07bd1172902e782f288e4d44b1fde7dec0f08b6f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 1 Jul 2013 22:14:10 +0200 Subject: tick: Sanitize broadcast control logic The recent implementation of a generic dummy timer resulted in a different registration order of per cpu local timers which made the broadcast control logic go belly up. If the dummy timer is the first clock event device which is registered for a CPU, then it is installed, the broadcast timer is initialized and the CPU is marked as broadcast target. 
If a real clock event device is installed after that, we can fail to take the CPU out of the broadcast mask. In the worst case we end up with two periodic timer events firing for the same CPU. One from the per cpu hardware device and one from the broadcast. Now the problem is that we have no way to distinguish whether the system is in a state which makes broadcasting necessary or the broadcast bit was set due to the nonfunctional dummy timer installation. To solve this we need to keep track of the system state separately and provide a more detailed decision logic for whether we keep the CPU in broadcast mode or not. The old decision logic only clears the broadcast mode, if the newly installed clock event device is not affected by power states. The new logic clears the broadcast mode if one of the following is true: - The new device is not affected by power states. - The system is not in a power state affected mode. - The system has switched to oneshot mode. The oneshot broadcast is controlled from the deep idle state. The CPU is not in idle at this point, so it's safe to remove it from the mask. If we clear the broadcast bit for the CPU when a new device is installed, we also shut down the broadcast device when this was the last CPU in the broadcast mask. If the broadcast bit is kept, then we leave the new device in shutdown state and rely on the broadcast to deliver the timer interrupts via the broadcast IPIs. Reported-and-tested-by: Stehle Vincent-B46079 Reviewed-by: Stephen Boyd Cc: John Stultz , Cc: Mark Rutland Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1307012153060.4013@ionos.tec.linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 70 +++++++++++++++++++++++++++++++++++++------- kernel/time/tick-common.c | 3 +- 2 files changed, 61 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 248f80dba746..4430fa695b48 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -30,6 +30,7 @@ static struct tick_device tick_broadcast_device; static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tick_broadcast_on; static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -140,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev) */ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret = 0; + int ret; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -155,20 +157,59 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_broadcast_mask); - tick_broadcast_start_periodic(tick_broadcast_device.evtdev); + tick_broadcast_start_periodic(bc); ret = 1; } else { /* - * When the new device is not affected by the stop - * feature and the cpu is marked in the broadcast mask - * then clear the broadcast bit. + * Clear the broadcast bit for this cpu if the + * device is not power state affected.
*/ - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { - int cpu = smp_processor_id(); + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) cpumask_clear_cpu(cpu, tick_broadcast_mask); - tick_broadcast_clear_oneshot(cpu); - } else { + else tick_device_setup_broadcast_func(dev); + + /* + * Clear the broadcast bit if the CPU is not in + * periodic broadcast on state. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_on)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_ONESHOT: + /* + * If the system is in oneshot mode we can + * unconditionally clear the oneshot mask bit, + * because the CPU is running and therefore + * not in an idle state which causes the power + * state affected device to stop. Let the + * caller initialize the device. + */ + tick_broadcast_clear_oneshot(cpu); + ret = 0; + break; + + case TICKDEV_MODE_PERIODIC: + /* + * If the system is in periodic mode, check + * whether the broadcast device can be + * switched off now. + */ + if (cpumask_empty(tick_broadcast_mask) && bc) + clockevents_shutdown(bc); + /* + * If we kept the cpu in the broadcast mask, + * tell the caller to leave the per cpu device + * in shutdown state. The periodic interrupt + * is delivered by the broadcast device. + */ + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + break; + default: + /* Nothing to do */ + ret = 0; + break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -298,6 +339,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) @@ -307,8 +349,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) tick_broadcast_force = 1; break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (!tick_broadcast_force && - cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { + if (tick_broadcast_force) + break; + cpumask_clear_cpu(cpu, tick_broadcast_on); + if (!tick_device_is_functional(dev)) + break; + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -366,6 +412,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) bc = tick_broadcast_device.evtdev; cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) @@ -821,6 +868,7 @@ bool tick_broadcast_oneshot_available(void) void __init tick_broadcast_init(void) { zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index edd45f64162f..64522ecdfe0e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td, * When global broadcasting is active, check if the current * device is registered as a placeholder for broadcast mode. * This allows us to handle this x86 misfeature in a generic - * way. + * way. This function also returns !=0 when we keep the + * current active broadcast state for this CPU. 
*/ if (tick_device_uses_broadcast(newdev, cpu)) return; -- cgit v1.2.3 From ff451961a8b2a17667a7bfa39c86fb9b351445db Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 1 Jul 2013 22:50:29 -0400 Subject: tracing: Add trace_array_get/put() to handle instance refs better Commit a695cb58162 "tracing: Prevent deleting instances when they are being read" tried to fix a race between deleting a trace instance and reading contents of a trace file. But it wasn't good enough. The following could crash the kernel: # cd /sys/kernel/debug/tracing/instances # ( while :; do mkdir foo; rmdir foo; done ) & # ( while :; do cat foo/trace &> /dev/null; done ) & Luckily this can only be done by the root user, but it should be fixed regardless. The problem is that a delete of the file can happen after the reader starts to open the file but before it grabs the trace_types_lock. The solution is to validate the trace array before using it. If the trace array does not exist in the list of trace arrays, then it returns -ENODEV. There's a possibility that a trace_array could be deleted and a new one created and the open would open its file instead. But that is very minor, as it will just return the data of the new trace array; it may confuse the user, but it will not crash the system. As this can only be done by root anyway, the race will only occur if root is deleting what it's trying to read at the same time. Cc: stable@vger.kernel.org # 3.10 Reported-by: Alexander Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 83 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e36da7ff59bf..6be9df1aa513 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -204,6 +204,37 @@ static struct trace_array global_trace; LIST_HEAD(ftrace_trace_arrays); +int trace_array_get(struct trace_array *this_tr) +{ + struct trace_array *tr; + int ret = -ENODEV; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + tr->ref++; + ret = 0; + break; + } + } + mutex_unlock(&trace_types_lock); + + return ret; +} + +static void __trace_array_put(struct trace_array *this_tr) +{ + WARN_ON(!this_tr->ref); + this_tr->ref--; +} + +void trace_array_put(struct trace_array *this_tr) +{ + mutex_lock(&trace_types_lock); + __trace_array_put(this_tr); + mutex_unlock(&trace_types_lock); +} + int filter_current_check_discard(struct ring_buffer *buffer, struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event) @@ -2831,10 +2862,9 @@ static const struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file, bool snapshot) +__tracing_open(struct trace_array *tr, struct trace_cpu *tc, + struct inode *inode, struct file *file, bool snapshot) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; struct trace_iterator *iter; int cpu; @@ -2913,8 +2943,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) tracing_iter_reset(iter, cpu); } - tr->ref++; - mutex_unlock(&trace_types_lock); return iter; @@ -2944,17 +2972,20 @@ static int tracing_release(struct inode *inode, struct file *file) struct trace_array *tr; int cpu; - if (!(file->f_mode & FMODE_READ)) + /* Writes do not use seq_file, need to grab tr from inode */ + if (!(file->f_mode & FMODE_READ)) { + struct trace_cpu *tc = inode->i_private; + +
trace_array_put(tc->tr); return 0; + } iter = m->private; tr = iter->tr; + trace_array_put(tr); mutex_lock(&trace_types_lock); - WARN_ON(!tr->ref); - tr->ref--; - for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2973,20 +3004,23 @@ static int tracing_release(struct inode *inode, struct file *file) kfree(iter->trace); kfree(iter->buffer_iter); seq_release_private(inode, file); + return 0; } static int tracing_open(struct inode *inode, struct file *file) { + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; - if (tc->cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->trace_buffer); else @@ -2994,12 +3028,16 @@ static int tracing_open(struct inode *inode, struct file *file) } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file, false); + iter = __tracing_open(tr, tc, inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } + + if (ret < 0) + trace_array_put(tr); + return ret; } @@ -4575,12 +4613,16 @@ struct ftrace_buffer_info { static int tracing_snapshot_open(struct inode *inode, struct file *file) { struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; struct seq_file *m; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file, true); + iter = __tracing_open(tr, tc, inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); } else { @@ -4593,13 +4635,16 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) kfree(m); return -ENOMEM; } - iter->tr = tc->tr; + iter->tr = tr; iter->trace_buffer = &tc->tr->max_buffer; iter->cpu_file = tc->cpu; m->private = iter; file->private_data = m; } + if (ret < 0) + trace_array_put(tr); + return ret; } @@ -4680,9 +4725,12 @@ out: static int tracing_snapshot_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; + int ret; + + ret = tracing_release(inode, file); if (file->f_mode & FMODE_READ) - return tracing_release(inode, file); + return ret; /* If write only, the seq_file is just a stub */ if (m) @@ -4927,8 +4975,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - WARN_ON(!iter->tr->ref); - iter->tr->ref--; + __trace_array_put(iter->tr); if (info->spare) ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); -- cgit v1.2.3 From 7b85af63034818e43aee6c1d7bf1c7c6796a9073 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 1 Jul 2013 23:34:22 -0400 Subject: tracing: Get trace_array ref counts when accessing trace files When a trace file is opened that may access a trace array, it must increment its ref count to prevent it from being deleted. 
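The idiom these two ref-counting patches depend on is easy to get wrong, so here is a minimal userspace sketch of it: a reference may only be taken after confirming, under the same lock the deleter holds while unlinking, that the object is still on the global list. The names (instance_get, instance_put, the list itself) are illustrative stand-ins, not the kernel interfaces:

	#include <pthread.h>
	#include <stdio.h>
	#include <stddef.h>

	struct instance {
		struct instance *next;
		int ref;
	};

	static struct instance *instances;	/* plays the role of ftrace_trace_arrays */
	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

	/*
	 * Take a reference only if the object is still on the list.
	 * If a concurrent delete already unlinked it, fail; this is
	 * how the open() paths above report -ENODEV.
	 */
	static int instance_get(struct instance *want)
	{
		struct instance *i;
		int ret = -1;	/* stand-in for -ENODEV */

		pthread_mutex_lock(&list_lock);
		for (i = instances; i; i = i->next) {
			if (i == want) {
				i->ref++;
				ret = 0;
				break;
			}
		}
		pthread_mutex_unlock(&list_lock);
		return ret;
	}

	static void instance_put(struct instance *i)
	{
		pthread_mutex_lock(&list_lock);
		i->ref--;	/* the deleter may only free once this is 0 */
		pthread_mutex_unlock(&list_lock);
	}

	int main(void)
	{
		struct instance foo = { .next = NULL, .ref = 0 };

		instances = &foo;			/* "mkdir foo" */
		if (instance_get(&foo) == 0) {		/* open() succeeds, ref held */
			instances = NULL;		/* "rmdir foo" unlinks it */
			/* reader keeps using foo safely here */
			instance_put(&foo);		/* release() drops the ref */
		}
		printf("ref after release: %d\n", foo.ref);
		return 0;
	}

The key point, mirrored in trace_array_get(): the lookup and the increment are one atomic step with respect to deletion, so a ref can never be taken on an already-unlinked instance.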
Cc: stable@vger.kernel.org # 3.10 Reported-by: Alexander Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 112 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6be9df1aa513..6d9bd9b43e43 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2965,6 +2965,43 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. + */ +int tracing_open_generic_tr(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + filp->private_data = inode->i_private; + + return 0; + +} + +int tracing_open_generic_tc(struct inode *inode, struct file *filp) +{ + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + filp->private_data = inode->i_private; + + return 0; + +} + static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; @@ -3008,6 +3045,32 @@ static int tracing_release(struct inode *inode, struct file *file) return 0; } +static int tracing_release_generic_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return 0; +} + +static int tracing_release_generic_tc(struct inode *inode, struct file *file) +{ + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; + + trace_array_put(tr); + return 0; +} + +static int tracing_single_release_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return single_release(inode, file); +} + static int tracing_open(struct inode *inode, struct file *file) { struct trace_cpu *tc = inode->i_private; @@ -3394,9 +3457,14 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, static int tracing_trace_options_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + return single_open(file, tracing_trace_options_show, inode->i_private); } @@ -3404,7 +3472,7 @@ static const struct file_operations tracing_iter_fops = { .open = tracing_trace_options_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_tr, .write = tracing_trace_options_write, }; @@ -3892,6 +3960,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + mutex_lock(&trace_types_lock); /* create a buffer to store the information to pass to userspace */ @@ -3944,6 +4015,7 @@ out: fail: kfree(iter->trace); kfree(iter); + __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; } @@ -3951,6 +4023,8 @@ fail: static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; mutex_lock(&trace_types_lock); @@ -3964,6 +4038,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) kfree(iter->trace); kfree(iter); + trace_array_put(tr); + 
return 0; } @@ -4421,6 +4497,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); + trace_array_put(tr); + return 0; } @@ -4597,10 +4675,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, static int tracing_clock_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + int ret; + if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_clock_show, inode->i_private); + if (trace_array_get(tr)) + return -ENODEV; + + ret = single_open(file, tracing_clock_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; } struct ftrace_buffer_info { @@ -4796,34 +4884,38 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tc, .read = tracing_entries_read, .write = tracing_entries_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tc, }; static const struct file_operations tracing_total_entries_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_total_entries_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_free_buffer_fops = { + .open = tracing_open_generic_tr, .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .write = tracing_mark_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_tr, .write = tracing_clock_write, }; @@ -4851,13 +4943,19 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) struct trace_cpu *tc = inode->i_private; struct trace_array *tr = tc->tr; struct ftrace_buffer_info *info; + int ret; if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) + if (!info) { + trace_array_put(tr); return -ENOMEM; + } mutex_lock(&trace_types_lock); @@ -4875,7 +4973,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) mutex_unlock(&trace_types_lock); - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret < 0) + trace_array_put(tr); + + return ret; } static unsigned int @@ -5765,9 +5867,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, } static const struct file_operations rb_simple_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = rb_simple_read, .write = rb_simple_write, + .release = tracing_release_generic_tr, .llseek = default_llseek, }; -- cgit v1.2.3 From 8e2e2fa47129532a30cff6c25a47078dc97d9260 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 2 Jul 2013 15:30:53 -0400 Subject: tracing: Add trace_array_get/put() to event handling Commit a695cb58162 "tracing: Prevent deleting instances when they are being read" tried to fix a race between deleting a trace instance and reading contents of a trace file. But it wasn't good enough. 
The following could crash the kernel: # cd /sys/kernel/debug/tracing/instances # ( while :; do mkdir foo; rmdir foo; done ) & # ( while :; do echo 1 > foo/events/sched/sched_switch 2> /dev/null; done ) & Luckily this can only be done by the root user, but it should be fixed regardless. The problem is that a delete of the file can happen after the write to the event is opened, but before the enabling happens. The solution is to make sure the trace_array is available before succeeding in opening for write, and increment the ref counter while opened. Now the instance can be deleted while the events are writing to the buffer, but the deletion of the instance will disable all events before the instance is actually deleted. Cc: stable@vger.kernel.org # 3.10 Reported-by: Alexander Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 55 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2c3cba59552d..c7fbf93f1b7c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -226,6 +226,9 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; +extern int trace_array_get(struct trace_array *tr); +extern void trace_array_put(struct trace_array *tr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 35c6f23c71b2..920e08fb53b3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -409,6 +409,35 @@ static void put_system(struct ftrace_subsystem_dir *dir) mutex_unlock(&event_mutex); } +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. + */ +static int tracing_open_generic_file(struct inode *inode, struct file *filp) +{ + struct ftrace_event_file *file = inode->i_private; + struct trace_array *tr = file->tr; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static int tracing_release_generic_file(struct inode *inode, struct file *filp) +{ + struct ftrace_event_file *file = inode->i_private; + struct trace_array *tr = file->tr; + + trace_array_put(tr); + + return 0; +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
*/ @@ -1032,9 +1061,17 @@ static int subsystem_open(struct inode *inode, struct file *filp) /* Some versions of gcc think dir can be uninitialized here */ WARN_ON(!dir); + /* Still need to increment the ref count of the system */ + if (trace_array_get(tr) < 0) { + put_system(dir); + return -ENODEV; + } + ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0) { + trace_array_put(tr); put_system(dir); + } return ret; } @@ -1045,16 +1082,23 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); - if (!dir) + if (!dir) { + trace_array_put(tr); return -ENOMEM; + } dir->tr = tr; ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0) { + trace_array_put(tr); kfree(dir); + } filp->private_data = dir; @@ -1065,6 +1109,8 @@ static int subsystem_release(struct inode *inode, struct file *file) { struct ftrace_subsystem_dir *dir = file->private_data; + trace_array_put(dir->tr); + /* * If dir->subsystem is NULL, then this is a temporary * descriptor that was made for a trace_array to enable @@ -1192,9 +1238,10 @@ static const struct file_operations ftrace_set_event_fops = { }; static const struct file_operations ftrace_enable_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_file, .read = event_enable_read, .write = event_enable_write, + .release = tracing_release_generic_file, .llseek = default_llseek, }; -- cgit v1.2.3 From 2a6c24afab70dbcfee49f4c76e1511eec1a3298b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 2 Jul 2013 14:48:23 -0400 Subject: tracing: Fix race between deleting buffer and setting events While analyzing the code, I discovered that there's a potential race between deleting a trace instance and setting events. There are a few races that can occur if events are being traced as the buffer is being deleted. Mostly the problem comes with freeing the descriptor used by the trace event callback. To prevent problems like this, the events are disabled before the buffer is deleted. The problem with the current solution is that the event_mutex is let go between disabling the events and freeing the files, which means that the events could be enabled again while the freeing takes place. Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 920e08fb53b3..7d854290bf81 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -441,14 +441,14 @@ static int tracing_release_generic_file(struct inode *inode, struct file *filp) /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ -static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) +static int +__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) { struct ftrace_event_file *file; struct ftrace_event_call *call; int ret = -EINVAL; - mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; @@ -474,6 +474,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, ret = 0; } + + return ret; +} + +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) +{ + int ret; + + mutex_lock(&event_mutex); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); mutex_unlock(&event_mutex); return ret; @@ -2408,11 +2419,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) int event_trace_del_tracer(struct trace_array *tr) { - /* Disable any running events */ - __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); - mutex_lock(&event_mutex); + /* Disable any running events */ + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); -- cgit v1.2.3 From fa44063f9ef163c3a4c8d8c0465bb8a056b42035 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Thu, 13 Jun 2013 14:21:51 +0800 Subject: uprobes: Fix return value in error handling path When wrong argument is passed into uprobe_events it does not return an error: [root@jovi tracing]# echo 'p:myprobe /bin/bash' > uprobe_events [root@jovi tracing]# The proper response is: [root@jovi tracing]# echo 'p:myprobe /bin/bash' > uprobe_events -bash: echo: write error: Invalid argument Link: http://lkml.kernel.org/r/51B964FF.5000106@huawei.com Cc: Frederic Weisbecker Cc: Cc: stable@vger.kernel.org # 3.5+ Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 32494fb0ee64..d5d0cd368a56 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -283,8 +283,10 @@ static int create_trace_uprobe(int argc, char **argv) return -EINVAL; } arg = strchr(argv[1], ':'); - if (!arg) + if (!arg) { + ret = -EINVAL; goto fail_address_parse; + } *arg++ = '\0'; filename = argv[1]; -- cgit v1.2.3 From 11034ae9c20f4057a6127fc965906417978e69b2 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Wed, 10 Apr 2013 11:26:23 +0800 Subject: tracing: Fix irqs-off tag display in syscall tracing All syscall tracing irqs-off tags are wrong, the syscall enter entry doesn't disable irqs. [root@jovi tracing]#echo "syscalls:sys_enter_open" > set_event [root@jovi tracing]# cat trace # tracer: nop # # entries-in-buffer/entries-written: 13/13 #P:2 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | irqbalance-513 [000] d... 56115.496766: sys_open(filename: 804e1a6, flags: 0, mode: 1b6) irqbalance-513 [000] d... 56115.497008: sys_open(filename: 804e1bb, flags: 0, mode: 1b6) sendmail-771 [000] d... 56115.827982: sys_open(filename: b770e6d1, flags: 0, mode: 1b6) The reason is syscall tracing doesn't record irq_flags into buffer. 
The proper display is: [root@jovi tracing]#echo "syscalls:sys_enter_open" > set_event [root@jovi tracing]# cat trace # tracer: nop # # entries-in-buffer/entries-written: 14/14 #P:2 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | irqbalance-514 [001] .... 46.213921: sys_open(filename: 804e1a6, flags: 0, mode: 1b6) irqbalance-514 [001] .... 46.214160: sys_open(filename: 804e1bb, flags: 0, mode: 1b6) <...>-920 [001] .... 47.307260: sys_open(filename: 4e82a0c5, flags: 80000, mode: 0) Link: http://lkml.kernel.org/r/1365564393-10972-3-git-send-email-jovi.zhangwei@huawei.com Cc: stable@vger.kernel.org # 2.6.35 Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8f2ac73c7a5f..322e16461072 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; int syscall_nr; int size; @@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + local_save_flags(irq_flags); + pc = preempt_count(); + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, - sys_data->enter_event->event.type, size, 0, 0); + sys_data->enter_event->event.type, size, irq_flags, pc); if (!event) return; @@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!filter_current_check_discard(buffer, sys_data->enter_event, entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); + trace_current_buffer_unlock_commit(buffer, event, + irq_flags, pc); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); @@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; + local_save_flags(irq_flags); + pc = preempt_count(); + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, - sys_data->exit_event->event.type, sizeof(*entry), 0, 0); + sys_data->exit_event->event.type, sizeof(*entry), + irq_flags, pc); if (!event) return; @@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!filter_current_check_discard(buffer, sys_data->exit_event, entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); + trace_current_buffer_unlock_commit(buffer, event, + irq_flags, pc); } static int reg_event_syscall_enter(struct ftrace_event_file *file, -- cgit v1.2.3 From 5280bcef91e706770cc1706eb97353e3513322b9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 2 Jul 2013 19:59:57 -0400 Subject: tracing: Make tracer_tracing_{off,on,is_on}() static I have patches that will use tracer_tracing_on/off/is_on() in other files, but as they 
are not ready to be merged yet, and Fengguang Wu's sparse scripts pointed out that these functions were not declared anywhere, I'll make them static for now. When these functions are required to be used elsewhere, I'll remove the static then. Reported-by: kbuild test robot Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6d9bd9b43e43..48aceb8a0328 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -387,7 +387,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; -void tracer_tracing_on(struct trace_array *tr) +static void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_on(tr->trace_buffer.buffer); @@ -606,7 +606,7 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ -void tracer_tracing_off(struct trace_array *tr) +static void tracer_tracing_off(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_off(tr->trace_buffer.buffer); @@ -649,7 +649,7 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not. */ -int tracer_tracing_is_on(struct trace_array *tr) +static int tracer_tracing_is_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) return ring_buffer_record_is_on(tr->trace_buffer.buffer); -- cgit v1.2.3 From 4480361c3c592fcbce3ef74e030719f0715e3a7e Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Wed, 10 Apr 2013 11:26:28 +0800 Subject: tracing: Remove TRACE_EVENT_TYPE enum definition TRACE_EVENT_TYPE enum is not used at present, remove it. Link: http://lkml.kernel.org/r/1365564393-10972-8-git-send-email-jovi.zhangwei@huawei.com Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c7fbf93f1b7c..1cbba04976b4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -907,12 +907,6 @@ static inline void trace_branch_disable(void) /* set ring buffers to default size if not already done so */ int tracing_update_buffers(void); -/* trace event type bit fields, not numeric */ -enum { - TRACE_EVENT_TYPE_PRINTF = 1, - TRACE_EVENT_TYPE_RAW = 2, -}; - struct ftrace_event_field { struct list_head link; const char *name; -- cgit v1.2.3 From 8de1eb02778b64f8b292db531cf39a429f84315f Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Wed, 10 Apr 2013 11:26:30 +0800 Subject: tracing: Remove ftrace() function The only caller of function ftrace(...) was removed a long time ago, so remove the function body as well. 
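For readers unfamiliar with the sparse complaint that motivated the two "make it static" patches in this series: a function defined with external linkage but declared in no header is almost always meant to be file-local, and sparse flags exactly that. A tiny standalone translation unit (not from the kernel tree) that reproduces the warning when checked with sparse:

	/* sparse: "symbol 'helper' was not declared. Should it be static?" */
	int helper(int x)
	{
		return x * 2;
	}

	/* File-local: internal linkage, no warning, compiler free to inline. */
	static int helper2(int x)
	{
		return x * 2;
	}

	int use(int x)
	{
		return helper(x) + helper2(x);
	}

Making such symbols static until an external user actually exists keeps the global namespace clean and lets the warning stay meaningful.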
Link: http://lkml.kernel.org/r/1365564393-10972-10-git-send-email-jovi.zhangwei@huawei.com Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 --------- kernel/trace/trace.h | 5 ----- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 48aceb8a0328..f6fed9e51c64 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1637,15 +1637,6 @@ trace_function(struct trace_array *tr, __buffer_unlock_commit(buffer, event); } -void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) -{ - if (likely(!atomic_read(&data->disabled))) - trace_function(tr, ip, parent_ip, flags, pc); -} - #ifdef CONFIG_STACKTRACE #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1cbba04976b4..a4ed382dea2f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -559,11 +559,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu); void poll_wait_pipe(struct trace_iterator *iter); -void ftrace(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long ip, - unsigned long parent_ip, - unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, -- cgit v1.2.3 From dcc302232c1f9b3ca16f6b8ee190eb0b1a8a0da3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 2 Jul 2013 20:30:52 -0400 Subject: tracing: Make tracing_open_generic_{tr,tc}() static I have patches that will use tracing_open_generic_tr/tc() in other files, but as they are not ready to be merged yet, and Fengguang Wu's sparse scripts pointed out that these functions were not declared anywhere, I'll make them static for now. When these functions are required to be used elsewhere, I'll remove the static then. Reported-by: kbuild test robot Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f6fed9e51c64..dc473b51415f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2960,7 +2960,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) * Open and update trace_array ref count. * Must have the current trace_array passed to it. */ -int tracing_open_generic_tr(struct inode *inode, struct file *filp) +static int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -2976,7 +2976,7 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) } -int tracing_open_generic_tc(struct inode *inode, struct file *filp) +static int tracing_open_generic_tc(struct inode *inode, struct file *filp) { struct trace_cpu *tc = inode->i_private; struct trace_array *tr = tc->tr; -- cgit v1.2.3 From 8d8022e8aba85192e937f1f0f7450e256d66ae5c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Jul 2013 10:06:28 +0930 Subject: module: do percpu allocation after uniqueness check. No, really! v3.8-rc1-5-g1fb9341 was supposed to stop parallel kvm loads exhausting percpu memory on large machines: Now we have a new state MODULE_STATE_UNFORMED, we can insert the module into the list (and thus guarantee its uniqueness) before we allocate the per-cpu region. In my defence, it didn't actually say the patch did this. Just that we "can". This patch actually *does* it. 
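The ordering argument in this commit generalizes: when N identical loads race, claim the unique identity first and touch the scarce allocator only after winning, so the N-1 losers bail out cheaply. A hedged userspace sketch of that ordering; claim_name() and alloc_scarce() are invented stand-ins for the module-list insertion and __alloc_reserved_percpu(), not real kernel calls:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static const char *loaded[16];	/* stand-in for the module list */

	/* Insert the name if and only if it is not already present. */
	static int claim_name(const char *name)
	{
		int i, free_slot = -1, ret = -1;	/* -1: stand-in for -EEXIST */

		pthread_mutex_lock(&lock);
		for (i = 0; i < 16; i++) {
			if (loaded[i] && strcmp(loaded[i], name) == 0)
				goto out;	/* a parallel load already won */
			if (!loaded[i])
				free_slot = i;
		}
		if (free_slot >= 0) {
			loaded[free_slot] = name;
			ret = 0;
		}
	out:
		pthread_mutex_unlock(&lock);
		return ret;
	}

	static void *alloc_scarce(size_t size)
	{
		return malloc(size);	/* pretend: reserved per-cpu space */
	}

	static int load_one(const char *name)
	{
		if (claim_name(name))
			return -1;	/* loser exits before allocating anything */
		if (!alloc_scarce(4096))
			return -1;	/* real code would unclaim the name here */
		printf("%s loaded\n", name);
		return 0;
	}

	int main(void)
	{
		load_one("kvm");
		load_one("kvm");	/* second racer fails without allocating */
		return 0;
	}

With the old order every parallel loser allocated first and freed later, which is precisely what exhausted the reserved per-cpu area on large machines.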
Signed-off-by: Rusty Russell Tested-by: Jim Hull Cc: stable@kernel.org # 3.8 --- kernel/module.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5184877ce98a..d1a161be7b04 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2940,7 +2940,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; - Elf_Shdr *pcpusec; int err; mod = setup_load_info(info, flags); @@ -2955,17 +2954,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) err = module_frob_arch_sections(info->hdr, info->sechdrs, info->secstrings, mod); if (err < 0) - goto out; + return ERR_PTR(err); - pcpusec = &info->sechdrs[info->index.pcpu]; - if (pcpusec->sh_size) { - /* We have a special allocation for this section. */ - err = percpu_modalloc(mod, - pcpusec->sh_size, pcpusec->sh_addralign); - if (err) - goto out; - pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; - } + /* We will do a special allocation for per-cpu sections later. */ + info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2976,17 +2968,22 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Allocate and move to the final place */ err = move_module(mod, info); if (err) - goto free_percpu; + return ERR_PTR(err); /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); return mod; +} -free_percpu: - percpu_modfree(mod); -out: - return ERR_PTR(err); +static int alloc_module_percpu(struct module *mod, struct load_info *info) +{ + Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; + if (!pcpusec->sh_size) + return 0; + + /* We have a special allocation for this section. */ + return percpu_modalloc(mod, pcpusec->sh_size, pcpusec->sh_addralign); } /* mod is no longer valid after this! */ @@ -3262,6 +3259,11 @@ static int load_module(struct load_info *info, const char __user *uargs, } #endif + /* To avoid stressing percpu allocator, do this once we're unique. */ + err = alloc_module_percpu(mod, info); + if (err) + goto unlink_mod; + /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) -- cgit v1.2.3 From 9eb76d7797b892a1dad4f2efb6f786681306dd13 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Jul 2013 10:06:29 +0930 Subject: module: cleanup call chain. Fold alloc_module_percpu into percpu_modalloc(). 
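The fold follows a common refactoring: move the "section absent" early return into the callee and pass the richer context struct instead of pre-extracted scalars, so every caller collapses to one unconditional call. A rough sketch with simplified, illustrative types (load_info_like is not the kernel struct):

	#include <stdio.h>

	struct load_info_like {
		unsigned long pcpu_size;
		unsigned long pcpu_align;
	};

	/*
	 * After the fold the callee owns the "no per-cpu section" case,
	 * so the caller is just "err = percpu_modalloc(mod, info);"
	 * with no guard around it.
	 */
	static int percpu_modalloc_like(const struct load_info_like *info)
	{
		if (!info->pcpu_size)
			return 0;	/* nothing to allocate: trivially succeeds */
		if (info->pcpu_align > 4096)
			fprintf(stderr, "clamping alignment %lu\n",
				info->pcpu_align);
		/* the real code calls __alloc_reserved_percpu() here */
		return 0;
	}

	int main(void)
	{
		struct load_info_like none = { 0, 0 };
		struct load_info_like big = { 512, 8192 };

		printf("%d %d\n", percpu_modalloc_like(&none),
		       percpu_modalloc_like(&big));
		return 0;
	}

Pushing the precondition into the callee also removed the UP/SMP asymmetry: the stub version can keep the same one-argument-pair signature and simply reject a per-cpu section it cannot honor.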
Signed-off-by: Rusty Russell --- kernel/module.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d1a161be7b04..c18107942ac2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -482,23 +482,28 @@ static inline void __percpu *mod_percpu(struct module *mod) return mod->percpu; } -static int percpu_modalloc(struct module *mod, - unsigned long size, unsigned long align) +static int percpu_modalloc(struct module *mod, struct load_info *info) { + Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; + unsigned long align = pcpusec->sh_addralign; + + if (!pcpusec->sh_size) + return 0; + if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } - mod->percpu = __alloc_reserved_percpu(size, align); + mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); if (!mod->percpu) { printk(KERN_WARNING "%s: Could not allocate %lu bytes percpu data\n", - mod->name, size); + mod->name, (unsigned long)pcpusec->sh_size); return -ENOMEM; } - mod->percpu_size = size; + mod->percpu_size = pcpusec->sh_size; return 0; } @@ -563,10 +568,12 @@ static inline void __percpu *mod_percpu(struct module *mod) { return NULL; } -static inline int percpu_modalloc(struct module *mod, - unsigned long size, unsigned long align) +static int percpu_modalloc(struct module *mod, struct load_info *info) { - return -ENOMEM; + /* UP modules shouldn't have this section: ENOMEM isn't quite right */ + if (info->sechdrs[info->index.pcpu].sh_size != 0) + return -ENOMEM; + return 0; } static inline void percpu_modfree(struct module *mod) { @@ -2976,16 +2983,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) return mod; } -static int alloc_module_percpu(struct module *mod, struct load_info *info) -{ - Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; - if (!pcpusec->sh_size) - return 0; - - /* We have a special allocation for this section. */ - return percpu_modalloc(mod, pcpusec->sh_size, pcpusec->sh_addralign); -} - /* mod is no longer valid after this! */ static void module_deallocate(struct module *mod, struct load_info *info) { @@ -3260,7 +3257,7 @@ static int load_module(struct load_info *info, const char __user *uargs, #endif /* To avoid stressing percpu allocator, do this once we're unique. */ - err = alloc_module_percpu(mod, info); + err = percpu_modalloc(mod, info); if (err) goto unlink_mod; -- cgit v1.2.3 From 55ccb616a6e42052edb37e9c4f82cf8854a59429 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:42 +0000 Subject: posix_cpu_timer: consolidate expiry time type The posix cpu timer expiry time is stored in a union of two types: a 64 bits field if we rely on scheduler precise accounting, or a cputime_t if we rely on jiffies. This results in quite some duplicate code and special cases to handle the two types. Just unify this into a single 64 bits field. cputime_t can always fit into it. 
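To see why the unification removes so much code: with the union, every comparison, addition and subtraction had to branch on whether the clock was CPUCLOCK_SCHED (nanoseconds in .sched) or jiffies-based (cputime_t in .cpu). With a single 64-bit field the conversion happens once at the boundary and plain integer operators work everywhere. An illustrative before/after sketch, with cputime_t faked as a typedef so it compiles standalone:

	#include <stdio.h>

	typedef unsigned long cputime_t;	/* stand-in for the kernel type */

	/* Before: a tagged union, so every caller branches on the clock type. */
	union cpu_time_count {
		unsigned long long sched;	/* ns, CPUCLOCK_SCHED */
		cputime_t cpu;			/* jiffies-based clocks */
	};

	static int before_expired(int is_sched, union cpu_time_count now,
				  union cpu_time_count exp)
	{
		if (is_sched)
			return now.sched >= exp.sched;
		return now.cpu >= exp.cpu;
	}

	/* After: convert cputime_t once at the edge, then compare as plain u64. */
	static unsigned long long cputime_to_expires(cputime_t ct)
	{
		return (unsigned long long)ct;	/* kernel version is a __force cast */
	}

	static int after_expired(unsigned long long now, unsigned long long exp)
	{
		return now >= exp;
	}

	int main(void)
	{
		union cpu_time_count now = { .sched = 100 }, exp = { .sched = 50 };

		printf("%d %d\n", before_expired(1, now, exp),
		       after_expired(cputime_to_expires(7), 5));
		return 0;
	}

This is exactly why helpers like cpu_time_before(), cpu_time_add() and cpu_time_sub() disappear in the diff below: once both representations live in one unsigned long long, <, += and -= do the job directly.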
Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 266 ++++++++++++++++++---------------------------- 1 file changed, 106 insertions(+), 160 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e0..c3c4ea1225a4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline union cpu_time_count +static inline unsigned long long timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) { - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { - ret.cpu = timespec_to_cputime(tp); + ret = cputime_to_expires(timespec_to_cputime(tp)); } return ret; } static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, + unsigned long long expires, struct timespec *tp) { if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); + *tp = ns_to_timespec(expires); else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return now.cpu < then.cpu; - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched += val.sched; - } else { - acc->cpu += val.cpu; - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu -= b.cpu; - } - return a; + cputime_to_timespec((__force cputime_t)expires, tp); } /* @@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, * given the current clock sample. */ static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) + unsigned long long now) { int i; + unsigned long long delta, incr; - if (timer->it.cpu.incr.sched == 0) + if (timer->it.cpu.incr == 0) return; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; + if (now < timer->it.cpu.expires) + return; - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. 
*/ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; - if (now.cpu < timer->it.cpu.expires.cpu) - return; - incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.cpu += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; } } @@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline cputime_t prof_ticks(struct task_struct *p) +static inline unsigned long long prof_ticks(struct task_struct *p) { cputime_t utime, stime; task_cputime(p, &utime, &stime); - return utime + stime; + return cputime_to_expires(utime + stime); } -static inline cputime_t virt_ticks(struct task_struct *p) +static inline unsigned long long virt_ticks(struct task_struct *p) { cputime_t utime; task_cputime(p, &utime, NULL); - return utime; + return cputime_to_expires(utime); } static int @@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) * Sample a per-thread clock for the given task. */ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); + *sample = prof_ticks(p); break; case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); + *sample = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + *sample = task_sched_runtime(p); break; } return 0; @@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: thread_group_cputime(p, &cputime); - cpu->sched = cputime.sum_exec_runtime; + *sample = cputime.sum_exec_runtime; break; } return 0; @@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; - union cpu_time_count rtn; + unsigned long long rtn; if (pid == 0) { /* @@ -461,30 +414,30 @@ static void cleanup_timers(struct list_head *head, list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - 
timer->expires.cpu = 0; + if (timer->expires < cputime_to_expires(ptime)) { + timer->expires = 0; } else { - timer->expires.cpu -= ptime; + timer->expires -= cputime_to_expires(ptime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; + if (timer->expires < cputime_to_expires(utime)) { + timer->expires = 0; } else { - timer->expires.cpu -= utime; + timer->expires -= cputime_to_expires(utime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; + if (timer->expires < sum_exec_runtime) { + timer->expires = 0; } else { - timer->expires.sched -= sum_exec_runtime; + timer->expires -= sum_exec_runtime; } } } @@ -516,7 +469,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +static void clear_dead_task(struct k_itimer *timer, unsigned long long now) { /* * That's all for this thread or process. @@ -524,9 +477,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) */ put_task_struct(timer->it.cpu.task); timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); + timer->it.cpu.expires -= now; } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -558,14 +509,14 @@ static void arm_timer(struct k_itimer *timer) listpos = head; list_for_each_entry(next, head, entry) { - if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + if (nt->expires < next->expires) break; listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { - union cpu_time_count *exp = &nt->expires; + unsigned long long exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -576,17 +527,17 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp->cpu)) - cputime_expires->prof_exp = exp->cpu; + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp->cpu)) - cputime_expires->virt_exp = exp->cpu; + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); break; case CPUCLOCK_SCHED: if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp->sched) - cputime_expires->sched_exp = exp->sched; + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; break; } } @@ -601,20 +552,20 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * User don't want any signal. */ - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { /* * One-shot timer. Clear it as soon as it's fired. 
*/ posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -632,7 +583,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -641,13 +592,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -694,7 +645,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, old_incr, val; + unsigned long long old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -749,7 +700,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, } if (old) { - if (old_expires.sched == 0) { + if (old_expires == 0) { old->it_value.tv_sec = 0; old->it_value.tv_nsec = 0; } else { @@ -764,11 +715,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * new setting. */ bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; sample_to_timespec(timer->it_clock, old_expires, &old->it_value); @@ -791,8 +739,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, goto out; } - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); + if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + new_expires += val; } /* @@ -801,8 +749,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * arm the timer (we'll just fake it for timer_gettime). */ timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -826,8 +773,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires.sched != 0 && - !cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && !(val < new_expires)) { /* * The designated time already passed, so we notify * immediately, even if the thread never runs to @@ -849,7 +795,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { - union cpu_time_count now; + unsigned long long now; struct task_struct *p = timer->it.cpu.task; int clear_dead; @@ -859,7 +805,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) sample_to_timespec(timer->it_clock, timer->it.cpu.incr, &itp->it_interval); - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. 
*/ + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; return; } @@ -891,7 +837,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) */ put_task_struct(p); timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto dead; } else { @@ -912,10 +858,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + if (now < timer->it.cpu.expires) { sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), + timer->it.cpu.expires - now, &itp->it_value); } else { /* @@ -946,8 +891,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.prof_exp = t->expires.cpu; + if (!--maxfire || prof_ticks(tsk) < t->expires) { + tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires); break; } t->firing = 1; @@ -961,8 +906,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.virt_exp = t->expires.cpu; + if (!--maxfire || virt_ticks(tsk) < t->expires) { + tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires); break; } t->firing = 1; @@ -976,8 +921,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->cputime_expires.sched_exp = t->expires.sched; + if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) { + tsk->cputime_expires.sched_exp = t->expires; break; } t->firing = 1; @@ -1030,7 +975,8 @@ static void stop_process_timers(struct signal_struct *sig) static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - cputime_t *expires, cputime_t cur_time, int signo) + unsigned long long *expires, + unsigned long long cur_time, int signo) { if (!it->expires) return; @@ -1068,7 +1014,7 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, ptime, virt_expires, prof_expires; + unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; @@ -1078,8 +1024,8 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. 
*/ thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = 0; @@ -1087,8 +1033,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || ptime < tl->expires.cpu) { - prof_expires = tl->expires.cpu; + if (!--maxfire || ptime < tl->expires) { + prof_expires = tl->expires; break; } tl->firing = 1; @@ -1102,8 +1048,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || utime < tl->expires.cpu) { - virt_expires = tl->expires.cpu; + if (!--maxfire || utime < tl->expires) { + virt_expires = tl->expires; break; } tl->firing = 1; @@ -1117,8 +1063,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; + if (!--maxfire || sum_sched_runtime < tl->expires) { + sched_expires = tl->expires; break; } tl->firing = 1; @@ -1162,8 +1108,8 @@ static void check_process_timers(struct task_struct *tsk, } } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1176,7 +1122,7 @@ static void check_process_timers(struct task_struct *tsk, void posix_cpu_timer_schedule(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; + unsigned long long now; if (unlikely(p == NULL)) /* @@ -1205,7 +1151,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ put_task_struct(p); timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* @@ -1387,7 +1333,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - union cpu_time_count now; + unsigned long long now; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1399,17 +1345,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (*oldval <= now.cpu) { + if (*oldval <= now) { /* Just about to fire. */ *oldval = cputime_one_jiffy; } else { - *oldval -= now.cpu; + *oldval -= now; } } if (!*newval) goto out; - *newval += now.cpu; + *newval += now; } /* @@ -1459,7 +1405,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { + if (timer.it.cpu.expires == 0) { /* * Our timer fired and was reset, below * deletion can not fail. 
-- cgit v1.2.3 From 1a7fa510b38e518d11365883934f1afa41625424 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:42 +0000 Subject: posix_cpu_timers: consolidate timer list cleanups Cleaning up the posix cpu timers on task exit shares some common code among timer list types, most notably the list traversal and expiry time update. Unify this in a common helper. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 48 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c3c4ea1225a4..b1450cee6d6d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -399,6 +399,21 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } +static void cleanup_timers_list(struct list_head *head, + unsigned long long curr) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) { + list_del_init(&timer->entry); + if (timer->expires < curr) { + timer->expires = 0; + } else { + timer->expires -= curr; + } + } +} + /* * Clean out CPU timers still ticking when a thread exited. The task * pointer is cleared, and the expiry time is replaced with the residual @@ -409,37 +424,12 @@ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, unsigned long long sum_exec_runtime) { - struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < cputime_to_expires(ptime)) { - timer->expires = 0; - } else { - timer->expires -= cputime_to_expires(ptime); - } - } - - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < cputime_to_expires(utime)) { - timer->expires = 0; - } else { - timer->expires -= cputime_to_expires(utime); - } - } + cputime_t ptime = utime + stime; - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < sum_exec_runtime) { - timer->expires = 0; - } else { - timer->expires -= sum_exec_runtime; - } - } + cleanup_timers_list(head, cputime_to_expires(ptime)); + cleanup_timers_list(++head, cputime_to_expires(utime)); + cleanup_timers_list(++head, sum_exec_runtime); } /* -- cgit v1.2.3 From 2473f3e7a97ce8bc0fe7596cdb361b21221418eb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix_cpu_timers: consolidate expired timers check Consolidate the common code amongst per thread and per process timers list on tick time. List traversal, expiry check and subsequent updates can be shared in a common helper. 
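As a side illustration (not part of the patch; a hypothetical userspace reduction, with a plain array standing in for the kernel list), the shared traversal boils down to: fire every entry whose expiry has passed, up to a fixed budget, and report the expiry of the first entry left pending. The real helper operates on struct cpu_timer_list and appears in the diff below.

#include <stdio.h>

/* Mirrors the consolidated check logic: mark up to the budget of
 * elapsed entries as firing, and return the expiry of the first
 * entry not fired (still pending, or past the budget). */
static unsigned long long check_timers(const unsigned long long *expires,
				       int n, unsigned long long curr)
{
	int i, maxfire = 20;

	for (i = 0; i < n; i++) {
		if (!--maxfire || curr < expires[i])
			return expires[i];
		printf("timer with expiry %llu fires\n", expires[i]);
	}
	return 0;
}

int main(void)
{
	unsigned long long t[] = { 5, 7, 20 };

	/* With curr == 10, the entries at 5 and 7 fire; 20 is pending. */
	printf("next expiry: %llu\n", check_timers(t, 3, 10));
	return 0;
}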
Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 118 +++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index b1450cee6d6d..92a4fbf44f86 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -862,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } } +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -870,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; unsigned long soft; - maxfire = 20; - tsk->cputime_expires.prof_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || prof_ticks(tsk) < t->expires) { - tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires); - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.virt_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || virt_ticks(tsk) < t->expires) { - tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires); - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.sched_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) { - tsk->cputime_expires.sched_exp = t->expires; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. 
@@ -1002,7 +990,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct signal_struct *const sig = tsk->signal; unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; @@ -1017,49 +1004,10 @@ static void check_process_timers(struct task_struct *tsk, utime = cputime_to_expires(cputime.utime); ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; - maxfire = 20; - prof_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || ptime < tl->expires) { - prof_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - ++timers; - maxfire = 20; - virt_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || utime < tl->expires) { - virt_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires) { - sched_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); /* * Check for the special case process timers. -- cgit v1.2.3 From 76cdcdd979ce00f5037804d73da583fb488ec1b2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix-timers: correctly get dying task time sample in posix_cpu_timer_schedule() In order to re-arm a timer after it fired, we take a sample of the current process or thread cputime. If the task is dying though, we don't arm anything but we cache the remaining timer expiration delta for further reads. Something similar is performed in posix_cpu_timer_get() but here we forget to take the process wide cputime sample before caching it. As a result we are storing random stack content, causing every further read of that timer to return junk values. Fix this by taking the appropriate sample in the case of process wide timers. This probably doesn't matter much in practice because, at this stage, the thread is the last one in the group and we have reached exit_notify(). This implies that we called exit_itimers() and there should be no more timers to handle for that task. So this is likely dead code anyway but let's fix the current logic and the warning that came along: kernel/posix-cpu-timers.c: In function 'posix_cpu_timer_schedule': kernel/posix-cpu-timers.c:1127: warning: 'now' may be used uninitialized in this function Then we can start to think further about cleaning up that code.
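The uninitialized read is the whole bug; a hypothetical userspace reduction of the pattern (all names invented) shows what the single added sample call prevents:

#include <stdio.h>

static void sample_group(long long *now)
{
	*now = 42;	/* stands in for cpu_timer_sample_group() */
}

int main(void)
{
	long long now;	/* analogous to 'now' in posix_cpu_timer_schedule() */
	int task_is_dying = 1;

	if (task_is_dying) {
		/* Without this call, 'now' would hold indeterminate
		 * stack content when consumed just below. */
		sample_group(&now);
		printf("caching residual delta against now=%lld\n", now);
	}
	return 0;
}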
Reported-by: Andrew Morton Reported-by: Chen Gang Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Chen Gang Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 92a4fbf44f86..4ebd8ad07c66 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1097,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * not yet reaped. Take this opportunity to * drop our task ref. */ + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead_task(timer, now); goto out_unlock; } -- cgit v1.2.3 From a0b2062b0904ef07944c4a6e4d0f88ee44f1e9f2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix_timers: fix racy timer delta caching on task exit When a task exits, we cache the remaining cputime delta of its timers before they expire. This is done from the following places: * When the task is reaped. We iterate through its list of posix cpu timers and store the remaining timer delta to the timer struct instead of the absolute value. (See posix_cpu_timers_exit() / posix_cpu_timers_exit_group() ) * When we call posix_cpu_timer_get() or posix_cpu_timer_schedule(). If the timer's task is considered dying when watched from these places, the same conversion from absolute to relative expiry time is performed. Then the given task's reference is released. (See clear_dead_task() ). The relevance of this caching is questionable but this is another and deeper debate. The big issue here is that these two sources of caching don't mix well together. More specifically, the caching can easily be done twice, resulting in a wrong delta as it gets spuriously subtracted a second time by the elapsed clock. This can happen in the following scenario: 1) The task exits and gets reaped: we call posix_cpu_timers_exit() and the absolute timer expiry values are converted to a relative delta. 2) timer_gettime() -> posix_cpu_timer_get() is called and relies on clear_dead_task() because tsk->exit_state == EXIT_DEAD. The delta gets subtracted again by the elapsed clock and we return a wrong result. To fix this, just remove the caching done on task reaping time. It doesn't bring much value on its own. The caching done from posix_cpu_timer_get/schedule is enough. And it would also be hard to get it really right: we could make it put and clear the target task in the timer struct so that readers know whether they are dealing with a relative cached value or an absolute one. But it would be racy. The only safe way to do it would be to lock the itimer->it_lock so that we know nobody reads the cputime expiry value while we modify it and its target task reference. Doing so would involve some funny workarounds to avoid circular lock against the sighand lock. There is just no reason to maintain this. The user visible effect of this patch can be observed by running the following code: it creates a subthread that launches a posix cputimer which expires after 10 seconds. But then the subthread only busy loops for 2 seconds and exits. The parent reaps the subthread and reads the timer value. Its expected value should then be the initial timer's expiration value minus the cputime elapsed in the subthread.
Roughly 10 - 2 = 8 seconds:

#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>

static timer_t id;
static struct itimerspec val = { .it_value.tv_sec = 10, }, new;

static void *thread(void *unused)
{
	int err;
	struct timeval start, end, diff;

	err = timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &id);
	if (err < 0) {
		perror("Can't create timer\n");
		return NULL;
	}

	/* Arm 10 sec timer */
	err = timer_settime(id, 0, &val, NULL);
	if (err < 0) {
		perror("Can't set timer\n");
		return NULL;
	}

	/* Exit after 2 seconds of execution */
	gettimeofday(&start, NULL);
	do {
		gettimeofday(&end, NULL);
		timersub(&end, &start, &diff);
	} while (diff.tv_sec < 2);

	return NULL;
}

int main(int argc, char **argv)
{
	pthread_t pthread;
	int err;

	err = pthread_create(&pthread, NULL, thread, NULL);
	if (err) {
		perror("Can't create thread\n");
		return -1;
	}
	pthread_join(pthread, NULL);
	/* Just wait a little bit to make sure the child got reaped */
	sleep(1);
	err = timer_gettime(id, &new);
	if (err)
		perror("Can't get timer value\n");
	printf("%ld %ld\n", (long)new.it_value.tv_sec, new.it_value.tv_nsec);

	return 0;
}

Before the patch: $ ./posix_cpu_timers 6 2278074 After the patch: $ ./posix_cpu_timers 8 1158766 Before the patch, two extra seconds of elapsed time were spuriously accounted. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 4ebd8ad07c66..c7f31aa272f7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -404,14 +404,8 @@ static void cleanup_timers_list(struct list_head *head, { struct cpu_timer_list *timer, *next; - list_for_each_entry_safe(timer, next, head, entry) { + list_for_each_entry_safe(timer, next, head, entry) list_del_init(&timer->entry); - if (timer->expires < curr) { - timer->expires = 0; - } else { - timer->expires -= curr; - } - } } /* @@ -459,15 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, unsigned long long now) +static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) { + struct cpu_timer_list *timer = &itimer->it.cpu; + /* * That's all for this thread or process. * We leave our residual in expires to be reported. */ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires -= now; + put_task_struct(timer->task); + timer->task = NULL; + if (timer->expires < now) { + timer->expires = 0; + } else { + timer->expires -= now; + } } static inline int expires_gt(cputime_t expires, cputime_t new_exp) -- cgit v1.2.3 From fa18f7bde3ad4568d1d343b60d963bfbd8dc3991 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sun, 26 May 2013 17:35:41 -0400 Subject: posix-cpu-timers: don't account cpu timer after stopped thread runtime accounting When tsk->signal->cputimer->running is 1, signal->cputimer (i.e. the per-process timer accounting) and tsk->sum_sched_runtime (i.e. the per-thread timer accounting) increase at the same pace because update_curr() updates both. However, there is one exception. When a thread is exiting, __exit_signal() turns over the task's sum_sched_runtime to sig->sum_sched_runtime, but it doesn't stop the signal->cputimer accounting. This inconsistency makes the POSIX timer wake up too early.
This patch fixes it. Original-patch-by: Olivier Langlois Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Peter Zijlstra Signed-off-by: Olivier Langlois Signed-off-by: KOSAKI Motohiro Signed-off-by: Frederic Weisbecker --- kernel/sched/stats.h | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5e..71bac979d5ee 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -161,6 +161,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) * on CONFIG_SCHEDSTATS. */ +/** + * cputimer_running - return true if cputimer is running + * + * @tsk: Pointer to target task. + */ +static inline bool cputimer_running(struct task_struct *tsk) + +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return false; + + /* + * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime + * in __exit_signal(), we won't account to the signal struct further + * cputime consumed by that task, even though the task can still be + * ticking after __exit_signal(). + * + * In order to keep a consistent behaviour between thread group cputime + * and thread group cputimer accounting, lets also ignore the cputime + * elapsing after __exit_signal() in any thread group timer running. + * + * This makes sure that POSIX CPU clocks and timers are synchronized, so + * that a POSIX CPU timer won't expire while the corresponding POSIX CPU + * clock delta is behind the expiring timer value. + */ + if (unlikely(!tsk->sighand)) + return false; + + return true; +} + /** * account_group_user_time - Maintain utime for a thread group. * @@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); -- cgit v1.2.3 From e5302920da9ef23f9d19d4e9ac85704cc25bee7a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 5 Jul 2013 00:30:11 +0200 Subject: perf: Fix interrupt handler timing harness This patch fixes a serious bug in: 14c63f17b1fd perf: Drop sample rate when sampling is too slow There was a misunderstanding of the API of the do_div() macro. It returns the remainder of the division, and this was not what the function expected, leading to disabling the interrupt latency watchdog. This patch also removes a duplicate assignment in perf_sample_event_took().
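For reference, do_div() divides a 64-bit value in place and returns the remainder; a userspace stand-in (the real macro lives in asm/div64.h, this demo only imitates its semantics) makes the bug pattern obvious:

#include <stdio.h>

/* Imitates the kernel's do_div(): n becomes the quotient and the
 * macro's value is the REMAINDER. */
#define do_div_demo(n, base) ({					\
	unsigned int __rem = (unsigned int)((n) % (base));	\
	(n) /= (base);						\
	__rem;							\
})

int main(void)
{
	unsigned long long tmp = 250;
	unsigned int rem = do_div_demo(tmp, 100);

	/* tmp == 2 (quotient), rem == 50 (remainder). Writing
	 * 'tmp = do_div(tmp, 100)' as the buggy code did stores the
	 * remainder into tmp, zeroing the limit whenever the value
	 * divides evenly. */
	printf("quotient=%llu remainder=%u\n", tmp, rem);
	return 0;
}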
Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: dave.hansen@linux.intel.com Cc: ak@linux.intel.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/20130704223010.GA30625@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1db3af933704..1833bc5a84a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -182,7 +182,7 @@ void update_perf_cpu_limits(void) u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; - tmp = do_div(tmp, 100); + do_div(tmp, 100); atomic_set(&perf_sample_allowed_ns, tmp); } @@ -232,7 +232,7 @@ DEFINE_PER_CPU(u64, running_sample_length); void perf_sample_event_took(u64 sample_len_ns) { u64 avg_local_sample_len; - u64 local_samples_len = __get_cpu_var(running_sample_length); + u64 local_samples_len; if (atomic_read(&perf_sample_allowed_ns) == 0) return; -- cgit v1.2.3 From 332962f2c88868ed3cdab466870baaa34dd58612 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Jul 2013 22:46:45 +0200 Subject: clocksource: Reselect clocksource when watchdog validated high-res capability Up to commit 5d33b883a (clocksource: Always verify highres capability) we had no sanity check when selecting a clocksource to prevent a non-highres-capable clocksource from being used when the system had already switched to highres/nohz mode. The new sanity check works, as Alex and Tim found out, but it prevents the TSC from being used. This happens because on x86 the boot process looks like this: tsc_start_frequency_validation(TSC); clocksource_register(HPET); clocksource_done_booting(); clocksource_select() Selects HPET which is valid for high-res switch_to_highres(); clocksource_register(TSC); TSC is not selected, because it is not yet flagged as VALID_HIGH_RES clocksource_watchdog() Validates TSC for highres, but that does not make TSC the current clocksource. Before the sanity check was added, we installed TSC unvalidated, which worked most of the time. If the TSC was really detected as unstable, then the unstable logic removed it and installed HPET again. The sanity check is correct and needed. So the watchdog needs to kick a reselection of the clocksource when it qualifies TSC as a valid highres clocksource. To solve this, we mark the clocksource which got the flag CLOCK_SOURCE_VALID_FOR_HRES set by the watchdog with a new flag CLOCK_SOURCE_RESELECT and trigger the watchdog thread. The watchdog thread evaluates the flag and invokes clocksource_select() when set. To avoid making the clocksource_done_booting() code, which is about to install the first real clocksource anyway, go through clocksource_select() and tick_oneshot_notify() pointlessly, split out the clocksource_watchdog_kthread() list walk code and invoke the select/notify only when called from clocksource_watchdog_kthread(). So clocksource_done_booting() can utilize the same split-out code without the select/notify invocation and the clocksource_mutex unlock/relock dance.
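The flag-and-kick pattern described above can be sketched in userspace (hypothetical pthread stand-ins for the watchdog context and the kthread; all names invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t sel_lock = PTHREAD_MUTEX_INITIALIZER;
static bool reselect;	/* plays the role of CLOCK_SOURCE_RESELECT */

static void watchdog_validated_highres(void)
{
	/* Runs in a context that must not reselect directly: record
	 * the request, then kick the worker (schedule_work analogue). */
	pthread_mutex_lock(&sel_lock);
	reselect = true;
	pthread_mutex_unlock(&sel_lock);
}

static void *worker(void *unused)
{
	pthread_mutex_lock(&sel_lock);
	if (reselect) {
		reselect = false;
		printf("re-running clocksource selection\n");
	}
	pthread_mutex_unlock(&sel_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	watchdog_validated_highres();
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}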
Reported-and-tested-by: Alex Shi Cc: Hans Peter Anvin Cc: Tim Chen Cc: Andi Kleen Tested-by: Peter Zijlstra Cc: Ingo Molnar Cc: Davidlohr Bueso Cc: John Stultz Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1307042239150.11637@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 57 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e713ef7d19a7..50a8736757f3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -181,6 +181,7 @@ static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; @@ -301,13 +302,30 @@ static void clocksource_watchdog(unsigned long data) if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + /* Mark it valid for high-res. */ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + /* + * clocksource_done_booting() will sort it if + * finished_booting is not set yet. + */ + if (!finished_booting) + continue; + /* - * We just marked the clocksource as highres-capable, - * notify the rest of the system as well so that we - * transition into high-res mode: + * If this is not the current clocksource let + * the watchdog thread reselect it. Due to the + * change to high res this clocksource might + * be preferred now. If it is the current + * clocksource let the tick code know about + * that change. */ - tick_clock_notify(); + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); + } } } @@ -404,19 +422,25 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static int clocksource_watchdog_kthread(void *data) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; LIST_HEAD(unstable); + int select = 0; - mutex_lock(&clocksource_mutex); spin_lock_irqsave(&watchdog_lock, flags); - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); list_add(&cs->wd_list, &unstable); + select = 1; + } + if (cs->flags & CLOCK_SOURCE_RESELECT) { + cs->flags &= ~CLOCK_SOURCE_RESELECT; + select = 1; } + } /* Check if the watchdog timer needs to be stopped. 
*/ clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -426,6 +450,14 @@ static int clocksource_watchdog_kthread(void *data) list_del_init(&cs->wd_list); __clocksource_change_rating(cs, 0); } + return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ + mutex_lock(&clocksource_mutex); + if (__clocksource_watchdog_kthread()) + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } @@ -445,7 +477,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -647,16 +679,11 @@ static int __init clocksource_done_booting(void) { mutex_lock(&clocksource_mutex); curr_clocksource = clocksource_default_clock(); - mutex_unlock(&clocksource_mutex); - finished_booting = 1; - /* * Run the watchdog first to eliminate unstable clock sources */ - clocksource_watchdog_kthread(NULL); - - mutex_lock(&clocksource_mutex); + __clocksource_watchdog_kthread(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; @@ -789,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); - clocksource_select(); } /** @@ -801,6 +827,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) { mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); + clocksource_select(); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); -- cgit v1.2.3 From 002fca5df168922103a2bb52748f9984e6de80b2 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Fri, 5 Jul 2013 17:13:12 +0800 Subject: genirq: generic chip: Use DIV_ROUND_UP to calculate numchips The number of interrupts in a domain may be not divisible by the number of interrupts each chip handles. Integer division may truncate the result, thus use DIV_ROUND_UP to count numchips. Seems all users of irq_alloc_domain_generic_chips() in current code do not have this issue. I just found the issue while reading the code. Signed-off-by: Axel Lin Cc: Grant Likely Cc: Tony Lindgren Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1373015592.18252.2.camel@phoenix Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 1c39eccc1eaf..2f274f30b7e2 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -278,7 +278,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) return -EINVAL; - numchips = d->revmap_data.linear.size / irqs_per_chip; + numchips = DIV_ROUND_UP(d->revmap_data.linear.size, irqs_per_chip); if (!numchips) return -EINVAL; -- cgit v1.2.3 From 5ec2481b7b47a4005bb446d176e5d0257400c77d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 5 Jul 2013 12:09:18 +0200 Subject: hrtimers: Move SMP function call to thread context smp_call_function_* must not be called from softirq context. But clock_was_set() which calls on_each_cpu() is called from softirq context to implement a delayed clock_was_set() for the timer interrupt handler. 
Though that almost never gets invoked. A recent change in the resume code uses the softirq-based delayed clock_was_set() to support Xen's resume mechanism. linux-next contains a new warning that fires if smp_call_function_* is called from softirq context, and that warning gets triggered by the Xen change. Fix this by moving the delayed clock_was_set() call to a work context. Reported-and-tested-by: Artem Savkov Reported-by: Sasha Levin Cc: David Vrabel Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Konrad Wilk Cc: John Stultz Cc: xen-devel@lists.xen.org Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e86827e94c9a..b9b9420a1297 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -721,17 +721,20 @@ static int hrtimer_switch_to_hres(void) return 1; } +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + /* - * Called from timekeeping code to reprogramm the hrtimer interrupt - * device. If called from the timer interrupt context we defer it to - * softirq context. + * Called from timekeeping and resume code to reprogramm the hrtimer + * interrupt device on all cpus. */ void clock_was_set_delayed(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - - cpu_base->clock_was_set = 1; - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + schedule_work(&hrtimer_work); } #else @@ -775,12 +778,7 @@ void clock_was_set(void) * During resume we might have to reprogram the high resolution timer * interrupt on all online CPUs. However, all other CPUs will be * stopped with IRQs interrupts disabled so the clock_was_set() call - * must be deferred to the softirq. - * - * The one-shot timer has already been programmed to fire immediately - * (see tick_resume_oneshot()) and this interrupt will trigger the - * softirq to run early enough to correctly reprogram the timers on - * all CPUs. + * must be deferred. */ void hrtimers_resume(void) { @@ -789,8 +787,10 @@ void hrtimers_resume(void) WARN_ONCE(!irqs_disabled(), KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - cpu_base->clock_was_set = 1; - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + /* Retrigger on the local CPU */ + retrigger_next_event(NULL); + /* And schedule a retrigger for all others */ + clock_was_set_delayed(); } static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) @@ -1441,13 +1441,6 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - - if (cpu_base->clock_was_set) { - cpu_base->clock_was_set = 0; - clock_was_set(); - } - hrtimer_peek_ahead_timers(); } -- cgit v1.2.3 From 73b0cd674ccc64c921e25bd7154f26d342116539 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Jul 2013 10:34:00 +0200 Subject: hrtimer: Remove unused variable Sigh, should have noticed myself.
Reported-by: fengguang.wu@intel.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b9b9420a1297..3a951d8d5770 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -782,8 +782,6 @@ void clock_was_set(void) */ void hrtimers_resume(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - WARN_ONCE(!irqs_disabled(), KERN_INFO "hrtimers_resume() called with IRQs enabled!"); -- cgit v1.2.3 From 79f6530cb59e2a0af6953742a33cc29e98ca631c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Jul 2013 15:59:36 -0700 Subject: audit: fix mq_open and mq_unlink to add the MQ root as a hidden parent audit_names record The old audit PATH records for mq_open looked like this: type=PATH msg=audit(1366282323.982:869): item=1 name=(null) inode=6777 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282323.982:869): item=0 name="test_mq" inode=26732 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 ...with the audit related changes that went into 3.7, they now look like this: type=PATH msg=audit(1366282236.776:3606): item=2 name=(null) inode=66655 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=1 name=(null) inode=6926 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=0 name="test_mq" Both of these look wrong to me. As Steve Grubb pointed out: "What we need is 1 PATH record that identifies the MQ. The other PATH records probably should not be there." Fix it to record the mq root as a parent, and flag it such that it should be hidden from view when the names are logged, since the root of the mq filesystem isn't terribly interesting. With this change, we get a single PATH record that looks more like this: type=PATH msg=audit(1368021604.836:484): item=0 name="test_mq" inode=16914 dev=00:0c mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmpfs_t:s0 In order to do this, a new audit_inode_parent_hidden() function is added. If we do it this way, then we avoid having the existing callers of audit_inode needing to do any sort of flag conversion if auditing is inactive. Signed-off-by: Jeff Layton Reported-by: Jiri Jaburek Cc: Steve Grubb Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.h | 1 + kernel/auditsc.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 1c95131ef760..123c9b7c3979 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -85,6 +85,7 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ + bool hidden; /* don't log this record */ bool name_put; /* call __putname()? 
*/ unsigned long ino; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3c8a601324a2..9845cb32b60a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } i = 0; - list_for_each_entry(n, &context->names_list, list) + list_for_each_entry(n, &context->names_list, list) { + if (n->hidden) + continue; audit_log_name(context, n, NULL, i++, &call_panic); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name) * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * @parent: does this dentry represent the parent? + * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; struct audit_names *n; + bool parent = flags & AUDIT_INODE_PARENT; if (!context->in_syscall) return; @@ -1831,6 +1835,8 @@ out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; + if (flags & AUDIT_INODE_HIDDEN) + n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; -- cgit v1.2.3 From 6beb8a23b50d38a003e80c5f16b50c56e8ae3387 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Mon, 8 Jul 2013 15:59:37 -0700 Subject: kernel/auditfilter.c: fixing build warning kernel/auditfilter.c:426: warning: this decimal constant is unsigned only in ISO C90 Signed-off-by: Raphael S. Carvalho Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6bd4a90d1991..0ee9eff866d6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } -- cgit v1.2.3 From 2f992ee85aaa7dfd2bda43efe4493af1e108d054 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Jul 2013 15:59:38 -0700 Subject: kernel/auditfilter.c: fix leak in audit_add_rule() error path If both 'tree' and 'watch' are valid we must call audit_put_tree(), just like the preceding code within audit_add_rule(). 
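The leak pattern generalizes: a reference taken before a multi-step setup must be dropped on every failure path, not only the ones a later helper cleans up. A hypothetical reduction in plain C (names invented):

#include <stdlib.h>

struct tree {
	int refcnt;
};

static void put_tree(struct tree *t)
{
	if (t && --t->refcnt == 0)
		free(t);
}

/* Mirrors the audit_add_rule() flow: the tree reference is taken
 * early, so the watch-setup error path must drop it too. */
static int add_rule(struct tree *t, int watch_fails)
{
	if (t)
		t->refcnt++;	/* reference taken up front */
	if (watch_fails) {
		put_tree(t);	/* the drop this patch adds */
		return -1;
	}
	put_tree(t);		/* demo only: a real rule would keep it */
	return 0;
}

int main(void)
{
	struct tree *t = calloc(1, sizeof(*t));

	if (!t)
		return 1;
	t->refcnt = 1;		/* caller's own reference */
	add_rule(t, 1);
	put_tree(t);		/* balanced: the tree is freed, no leak */
	return 0;
}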
Signed-off-by: Chen Gang Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0ee9eff866d6..3d15c66b7f0b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry) err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); + /* + * normally audit_add_tree_rule() will free it + * on failure + */ + if (tree) + audit_put_tree(tree); goto error; } } -- cgit v1.2.3 From b9ce54c9f59894e787e3067d2f758c297fcd6fd0 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Mon, 8 Jul 2013 15:59:39 -0700 Subject: audit: Fix decimal constant description Use proper decimal type for comparison with u32. Compilation warning was introduced by 780a7654 ("audit: Make testing for a valid loginuid explicit.") kernel/auditfilter.c: In function 'audit_data_to_entry': kernel/auditfilter.c:426:3: warning: this decimal constant is unsigned only in ISO C90 [enabled by default] if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { Signed-off-by: Michal Simek Cc: Al Viro Cc: Eric Paris Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3d15c66b7f0b..f7aee8be7fb2 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } -- cgit v1.2.3 From dcb6b45254e2281b6f99ea7f2d51343954aa3ba8 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 8 Jul 2013 16:00:42 -0700 Subject: panic: add cpu/pid to warn_slowpath_common in WARNING printk()s Add the cpu/pid that called WARN() so that the stack traces can be matched up with the WARNING messages. [akpm@linux-foundation.org: remove stray quote] Signed-off-by: Alex Thorlton Reviewed-by: Robin Holt Cc: Stephen Boyd Cc: Vikram Mulukutla Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..97712319f128 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -399,8 +399,9 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); + pr_warn("------------[ cut here ]------------\n"); + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", + raw_smp_processor_id(), current->pid, file, line, caller); if (args) vprintk(args->fmt, args->args); -- cgit v1.2.3 From 7c8df28633bf0b7eb253f866029be0ac59ddb062 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:54 -0700 Subject: ptrace: revert "Prepare to fix racy accesses on task breakpoints" This reverts commit bf26c018490c ("Prepare to fix racy accesses on task breakpoints"). 
The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. Now that ptrace_get_breakpoints/ptrace_put_breakpoints have no callers, we can kill them and remove task->ptrace_bp_refcnt. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Acked-by: Michael Neuling Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/ptrace.c | 16 ---------------- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fafe75d9e6f6..a949819055d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -808,7 +808,7 @@ void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - ptrace_put_breakpoints(tsk); + flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ba5e6cea181a..a146ee327f6a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1221,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ - if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) - return 0; - - return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ - if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) - flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -- cgit v1.2.3 From fab840fc2d542fabcab903db8e03589a6702ba5f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:05 -0700 Subject: ptrace: PTRACE_DETACH should do flush_ptrace_hw_breakpoint(child) Change ptrace_detach() to call flush_ptrace_hw_breakpoint(child). This frees the slots for non-ptrace PERF_TYPE_BREAKPOINT users, and this ensures that the tracee won't be killed by SIGTRAP triggered by the active breakpoints. Test-case: unsigned long encode_dr7(int drnum, int enable, unsigned int type, unsigned int len) { unsigned long dr7; dr7 = ((len | type) & 0xf) << (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); if (enable) dr7 |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return dr7; } int write_dr(int pid, int dr, unsigned long val) { return ptrace(PTRACE_POKEUSER, pid, offsetof (struct user, u_debugreg[dr]), val); } void func(void) { } int main(void) { int pid, stat; unsigned long dr7; pid = fork(); if (!pid) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); kill(getpid(), SIGHUP); func(); return 0x13; } assert(pid == waitpid(-1, &stat, 0)); assert(WSTOPSIG(stat) == SIGHUP); assert(write_dr(pid, 0, (long)func) == 0); dr7 = encode_dr7(0, 1, DR_RW_EXECUTE, DR_LEN_1); assert(write_dr(pid, 7, dr7) == 0); assert(ptrace(PTRACE_DETACH, pid, 0,0) == 0); assert(pid == waitpid(-1, &stat, 0)); assert(stat == 0x1300); return 0; } Before this patch the child is killed after PTRACE_DETACH. 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..4041f5747e73 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + flush_ptrace_hw_breakpoint(child); write_lock_irq(&tasklist_lock); /* -- cgit v1.2.3 From 0efbee70890c992f31a7b294ac654ff6c62d51c5 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:31 -0700 Subject: reboot: remove -stable friendly PF_THREAD_BOUND define Remove the prior patch's #define for easier backporting to the stable releases. Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 071de900c824..b882440bd0c0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,11 +362,6 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); -/* Add backwards compatibility for stable trees. */ -#ifndef PF_NO_SETAFFINITY -#define PF_NO_SETAFFINITY PF_THREAD_BOUND -#endif - static void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ -- cgit v1.2.3 From 15d94b82565ebfb0cf27830b96e6cf5ed2d12a9a Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:32 -0700 Subject: reboot: move shutdown/reboot related functions to kernel/reboot.c This patch is preparatory. It moves reboot related syscall, etc functions from kernel/sys.c to kernel/reboot.c. Signed-off-by: Robin Holt Cc: H. 
Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/reboot.c | 346 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 331 ----------------------------------------------------- 3 files changed, 347 insertions(+), 332 deletions(-) create mode 100644 kernel/reboot.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 271fd3119af9..470839d1a30e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o cred.o \ + notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/reboot.c b/kernel/reboot.c new file mode 100644 index 000000000000..37d2636a65c2 --- /dev/null +++ b/kernel/reboot.c @@ -0,0 +1,346 @@ +/* + * linux/kernel/reboot.c + * + * Copyright (C) 2013 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); + +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. + */ +void emergency_restart(void) +{ + kmsg_dump(KMSG_DUMP_EMERG); + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + usermodehelper_disable(); + device_shutdown(); +} + +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. + */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. 
+ */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + migrate_to_reboot_cpu(); + syscore_shutdown(); + if (!cmd) + printk(KERN_EMERG "Restarting system.\n"); + else + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ + blocking_notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + usermodehelper_disable(); + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + migrate_to_reboot_cpu(); + syscore_shutdown(); + printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); + machine_halt(); +} + +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + migrate_to_reboot_cpu(); + syscore_shutdown(); + printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. + */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(current); + char buffer[256]; + int ret = 0; + + /* We only trust the superuser with rebooting the system. */ + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. */ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). 
+ */ + ret = reboot_pid_ns(pid_ns, cmd); + if (ret) + return ret; + + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + + mutex_lock(&reboot_mutex); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + kernel_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + kernel_halt(); + do_exit(0); + panic("cannot halt"); + + case LINUX_REBOOT_CMD_POWER_OFF: + kernel_power_off(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = -EFAULT; + break; + } + buffer[sizeof(buffer) - 1] = '\0'; + + kernel_restart(buffer); + break; + +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + ret = kernel_kexec(); + break; +#endif + +#ifdef CONFIG_HIBERNATION + case LINUX_REBOOT_CMD_SW_SUSPEND: + ret = hibernate(); + break; +#endif + + default: + ret = -EINVAL; + break; + } + mutex_unlock(&reboot_mutex); + return ret; +} + +static void deferred_cad(struct work_struct *dummy) +{ + kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_cad_pid(SIGINT, 1); +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static int __orderly_poweroff(bool force) +{ + char **argv; + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret; + + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + ret = -ENOMEM; + } + + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} + +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. 
+ */ +int orderly_poweroff(bool force) +{ + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sys.c b/kernel/sys.c index b882440bd0c0..771129b299f8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -115,20 +115,6 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); -/* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. @@ -308,261 +294,6 @@ out_unlock: return retval; } -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. - */ -void emergency_restart(void) -{ - kmsg_dump(KMSG_DUMP_EMERG); - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - usermodehelper_disable(); - device_shutdown(); -} - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. - */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -static void migrate_to_reboot_cpu(void) -{ - /* The boot cpu is always logical cpu 0 */ - int cpu = 0; - - cpu_hotplug_disable(); - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(cpu)) - cpu = cpumask_first(cpu_online_mask); - - /* Prevent races with other tasks migrating this task */ - current->flags |= PF_NO_SETAFFINITY; - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. 
- */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - migrate_to_reboot_cpu(); - syscore_shutdown(); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - usermodehelper_disable(); - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); - kmsg_dump(KMSG_DUMP_HALT); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); - -static DEFINE_MUTEX(reboot_mutex); - -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. - */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(current); - char buffer[256]; - int ret = 0; - - /* We only trust the superuser with rebooting the system. */ - if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* - * If pid namespaces are enabled and the current task is in a child - * pid_namespace, the command is handled by reboot_pid_ns() which will - * call do_exit(). - */ - ret = reboot_pid_ns(pid_ns, cmd); - if (ret) - return ret; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. 
- */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - mutex_lock(&reboot_mutex); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - do_exit(0); - panic("cannot halt.\n"); - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - ret = -EFAULT; - break; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - ret = kernel_kexec(); - break; -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - ret = hibernate(); - break; -#endif - - default: - ret = -EINVAL; - break; - } - mutex_unlock(&reboot_mutex); - return ret; -} - -static void deferred_cad(struct work_struct *dummy) -{ - kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -2287,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, return err ? -EFAULT : 0; } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static int __orderly_poweroff(bool force) -{ - char **argv; - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); - if (argv) { - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - argv_free(argv); - } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - ret = -ENOMEM; - } - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} - -static bool poweroff_force; - -static void poweroff_work_func(struct work_struct *work) -{ - __orderly_poweroff(poweroff_force); -} - -static DECLARE_WORK(poweroff_work, poweroff_work_func); - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) -{ - if (force) /* do not override the pending "true" */ - poweroff_force = true; - schedule_work(&poweroff_work); - return 0; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); - /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill -- cgit v1.2.3 From 972ee83df88a7fd84c228a31b4f9611299898984 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:34 -0700 Subject: reboot: checkpatch.pl the new kernel/reboot.c file Get the new file to pass scripts/checkpatch.pl Signed-off-by: Robin Holt Cc: H. 
Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 37d2636a65c2..abb6a0483716 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -4,6 +4,8 @@ * Copyright (C) 2013 Linus Torvalds */ +#define pr_fmt(fmt) "reboot: " fmt + #include #include #include @@ -114,9 +116,9 @@ void kernel_restart(char *cmd) migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); + pr_emerg("Restarting system\n"); else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + pr_emerg("Restarting system with command '%s'\n", cmd); kmsg_dump(KMSG_DUMP_RESTART); machine_restart(cmd); } @@ -125,7 +127,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; usermodehelper_disable(); device_shutdown(); @@ -140,11 +142,10 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); migrate_to_reboot_cpu(); syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); + pr_emerg("System halted\n"); kmsg_dump(KMSG_DUMP_HALT); machine_halt(); } - EXPORT_SYMBOL_GPL(kernel_halt); /** @@ -159,7 +160,7 @@ void kernel_power_off(void) pm_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); + pr_emerg("Power down\n"); kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); } @@ -188,10 +189,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* For safety, we require "magic" arguments. */ if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) + magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; /* @@ -234,7 +235,8 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, break; case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1); + if (ret < 0) { ret = -EFAULT; break; } @@ -300,14 +302,11 @@ static int __orderly_poweroff(bool force) ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); ret = -ENOMEM; } if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); + pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* * I guess this should try to kick off some daemon to sync and * poweroff asap. Or not even bother syncing if we're doing an -- cgit v1.2.3 From 1b3a5d02ee070c8f9943333b9b6370f486601e0f Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:42 -0700 Subject: reboot: move arch/x86 reboot= handling to generic kernel Merge together the unicore32, arm, and x86 reboot= command line parameter handling. Signed-off-by: Robin Holt Cc: H. 
Peter Anvin Cc: Russell King Cc: Guan Xuetao Cc: Russ Anderson Cc: Robin Holt Acked-by: Ingo Molnar Acked-by: Guan Xuetao Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index abb6a0483716..269ed9384cc4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) "reboot: " fmt +#include #include #include #include @@ -24,6 +25,18 @@ int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); +#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#define DEFAULT_REBOOT_MODE = REBOOT_HARD +#else +#define DEFAULT_REBOOT_MODE +#endif +enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; + +int reboot_default; +int reboot_cpu; +enum reboot_type reboot_type = BOOT_ACPI; +int reboot_force; + /* * If set, this is used for preparing the system to power off. */ @@ -87,7 +100,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ - int cpu = 0; + int cpu = reboot_cpu; cpu_hotplug_disable(); @@ -343,3 +356,64 @@ int orderly_poweroff(bool force) return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); + +static int __init reboot_setup(char *str) +{ + for (;;) { + /* + * Having anything passed on the command line via + * reboot= will cause us to disable DMI checking + * below. + */ + reboot_default = 0; + + switch (*str) { + case 'w': + reboot_mode = REBOOT_WARM; + break; + + case 'c': + reboot_mode = REBOOT_COLD; + break; + + case 'h': + reboot_mode = REBOOT_HARD; + break; + + case 's': + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else + reboot_mode = REBOOT_SOFT; + break; + + case 'g': + reboot_mode = REBOOT_GPIO; + break; + + case 'b': + case 'a': + case 'k': + case 't': + case 'e': + case 'p': + reboot_type = *str; + break; + + case 'f': + reboot_force = 1; + break; + } + + str = strchr(str, ','); + if (str) + str++; + else + break; + } + return 1; +} +__setup("reboot=", reboot_setup); -- cgit v1.2.3 From 98d1e64f95b177d0f14efbdf695a1b28e1428035 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 10 Jul 2013 16:05:12 -0700 Subject: mm: remove free_area_cache Since all architectures have been converted to use vm_unmapped_area(), there is no remaining use for the free_area_cache. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: "James E.J. 
Bottomley" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: David Howells Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Paul Mackerras Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6e6a1c11b3e5..66635c80a813 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->nr_ptes = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); -- cgit v1.2.3 From 734df5ab549ca44f40de0f07af1c8803856dfb18 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 9 Jul 2013 17:44:10 +0200 Subject: perf: Clone child context from parent context pmu Currently when the child context for inherited events is created, it's based on the pmu object of the first event of the parent context. This is wrong for the following scenario: - HW context having HW and SW event - HW event got removed (closed) - SW event stays in HW context as the only event and its pmu is used to clone the child context The issue starts when the cpu context object is touched based on the pmu context object (__get_cpu_context). In this case the HW context will work with SW cpu context ending up with following WARN below. Fixing this by using parent context pmu object to clone from child context. Addresses the following warning reported by Vince Weaver: [ 2716.472065] ------------[ cut here ]------------ [ 2716.476035] WARNING: at kernel/events/core.c:2122 task_ctx_sched_out+0x3c/0x) [ 2716.476035] Modules linked in: nfsd auth_rpcgss oid_registry nfs_acl nfs locn [ 2716.476035] CPU: 0 PID: 3164 Comm: perf_fuzzer Not tainted 3.10.0-rc4 #2 [ 2716.476035] Hardware name: AOpen DE7000/nMCP7ALPx-DE R1.06 Oct.19.2012, BI2 [ 2716.476035] 0000000000000000 ffffffff8102e215 0000000000000000 ffff88011fc18 [ 2716.476035] ffff8801175557f0 0000000000000000 ffff880119fda88c ffffffff810ad [ 2716.476035] ffff880119fda880 ffffffff810af02a 0000000000000009 ffff880117550 [ 2716.476035] Call Trace: [ 2716.476035] [] ? warn_slowpath_common+0x5b/0x70 [ 2716.476035] [] ? task_ctx_sched_out+0x3c/0x5f [ 2716.476035] [] ? perf_event_exit_task+0xbf/0x194 [ 2716.476035] [] ? do_exit+0x3e7/0x90c [ 2716.476035] [] ? __do_fault+0x359/0x394 [ 2716.476035] [] ? do_group_exit+0x66/0x98 [ 2716.476035] [] ? get_signal_to_deliver+0x479/0x4ad [ 2716.476035] [] ? __perf_event_task_sched_out+0x230/0x2d1 [ 2716.476035] [] ? do_signal+0x3c/0x432 [ 2716.476035] [] ? ctx_sched_in+0x43/0x141 [ 2716.476035] [] ? perf_event_context_sched_in+0x7a/0x90 [ 2716.476035] [] ? __perf_event_task_sched_in+0x31/0x118 [ 2716.476035] [] ? mmdrop+0xd/0x1c [ 2716.476035] [] ? finish_task_switch+0x7d/0xa6 [ 2716.476035] [] ? do_notify_resume+0x20/0x5d [ 2716.476035] [] ? 
retint_signal+0x3d/0x78 [ 2716.476035] ---[ end trace 827178d8a5966c3d ]--- Reported-by: Vince Weaver Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1373384651-6109-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1833bc5a84a7..1d1f030e2f1e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7465,7 +7465,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * child. */ - child_ctx = alloc_perf_context(event->pmu, child); + child_ctx = alloc_perf_context(parent_ctx->pmu, child); if (!child_ctx) return -ENOMEM; -- cgit v1.2.3 From 06f417968beac6e6b614e17b37d347aa6a6b1d30 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 9 Jul 2013 17:44:11 +0200 Subject: perf: Remove WARN_ON_ONCE() check in __perf_event_enable() for valid scenario The '!ctx->is_active' check has a valid scenario, so there's no need for the warning. The reason is that there's a time window between the 'ctx->is_active' check in the perf_event_enable() function and the __perf_event_enable() function having: - IRQs on - ctx->lock unlocked where the task could be killed and 'ctx' deactivated by perf_event_exit_task(), ending up with the warning below. So remove the WARN_ON_ONCE() check and add comments to explain it all. This addresses the following warning reported by Vince Weaver: [ 324.983534] ------------[ cut here ]------------ [ 324.984420] WARNING: at kernel/events/core.c:1953 __perf_event_enable+0x187/0x190() [ 324.984420] Modules linked in: [ 324.984420] CPU: 19 PID: 2715 Comm: nmi_bug_snb Not tainted 3.10.0+ #246 [ 324.984420] Hardware name: Supermicro X8DTN/X8DTN, BIOS 4.6.3 01/08/2010 [ 324.984420] 0000000000000009 ffff88043fce3ec8 ffffffff8160ea0b ffff88043fce3f00 [ 324.984420] ffffffff81080ff0 ffff8802314fdc00 ffff880231a8f800 ffff88043fcf7860 [ 324.984420] 0000000000000286 ffff880231a8f800 ffff88043fce3f10 ffffffff8108103a [ 324.984420] Call Trace: [ 324.984420] [] dump_stack+0x19/0x1b [ 324.984420] [] warn_slowpath_common+0x70/0xa0 [ 324.984420] [] warn_slowpath_null+0x1a/0x20 [ 324.984420] [] __perf_event_enable+0x187/0x190 [ 324.984420] [] remote_function+0x40/0x50 [ 324.984420] [] generic_smp_call_function_single_interrupt+0xbe/0x130 [ 324.984420] [] smp_call_function_single_interrupt+0x27/0x40 [ 324.984420] [] call_function_single_interrupt+0x6f/0x80 [ 324.984420] [] ? _raw_spin_unlock_irqrestore+0x41/0x70 [ 324.984420] [] perf_event_exit_task+0x14d/0x210 [ 324.984420] [] ? switch_task_namespaces+0x24/0x60 [ 324.984420] [] do_exit+0x2b6/0xa40 [ 324.984420] [] ? _raw_spin_unlock_irq+0x2c/0x30 [ 324.984420] [] do_group_exit+0x49/0xc0 [ 324.984420] [] get_signal_to_deliver+0x254/0x620 [ 324.984420] [] do_signal+0x57/0x5a0 [ 324.984420] [] ? __do_page_fault+0x2a4/0x4e0 [ 324.984420] [] ? retint_restore_args+0xe/0xe [ 324.984420] [] ? 
retint_signal+0x11/0x84 [ 324.984420] [] do_notify_resume+0x65/0x80 [ 324.984420] [] retint_signal+0x46/0x84 [ 324.984420] ---[ end trace 442ec2f04db3771a ]--- Reported-by: Vince Weaver Signed-off-by: Jiri Olsa Suggested-by: Peter Zijlstra Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1373384651-6109-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1d1f030e2f1e..ef5e7cc686e3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1950,7 +1950,16 @@ static int __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - if (WARN_ON_ONCE(!ctx->is_active)) + /* + * There's a time window between 'ctx->is_active' check + * in perf_event_enable function and this place having: + * - IRQs on + * - ctx->lock unlocked + * + * where the task could be killed and 'ctx' deactivated + * by perf_event_exit_task. + */ + if (!ctx->is_active) return -EINVAL; raw_spin_lock(&ctx->lock); -- cgit v1.2.3 From 058ebd0eba3aff16b144eabf4510ed9510e1416e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 Jul 2013 11:08:33 +0200 Subject: perf: Fix perf_lock_task_context() vs RCU Jiri managed to trigger this warning: [] ====================================================== [] [ INFO: possible circular locking dependency detected ] [] 3.10.0+ #228 Tainted: G W [] ------------------------------------------------------- [] p/6613 is trying to acquire lock: [] (rcu_node_0){..-...}, at: [] rcu_read_unlock_special+0xa7/0x250 [] [] but task is already holding lock: [] (&ctx->lock){-.-...}, at: [] perf_lock_task_context+0xd9/0x2c0 [] [] which lock already depends on the new lock. [] [] the existing dependency chain (in reverse order) is: [] [] -> #4 (&ctx->lock){-.-...}: [] -> #3 (&rq->lock){-.-.-.}: [] -> #2 (&p->pi_lock){-.-.-.}: [] -> #1 (&rnp->nocb_gp_wq[1]){......}: [] -> #0 (rcu_node_0){..-...}: Paul was quick to explain that due to preemptible RCU we cannot call rcu_read_unlock() while holding scheduler (or nested) locks when part of the read side critical section was preemptible. Therefore solve it by making the entire RCU read side non-preemptible. Also pull out the retry from under the non-preempt to play nice with RT. Reported-by: Jiri Olsa Helped-out-by: Paul E. McKenney Cc: Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ef5e7cc686e3..eba8fb5834ae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -947,8 +947,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; - rcu_read_lock(); retry: + /* + * One of the few rules of preemptible RCU is that one cannot do + * rcu_read_unlock() while holding a scheduler (or nested) lock when + * part of the read side critical section was preemptible -- see + * rcu_read_unlock_special(). + * + * Since ctx->lock nests under rq->lock we must ensure the entire read + * side critical section is non-preemptible. 
+ */ + preempt_disable(); + rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* @@ -964,6 +974,8 @@ retry: raw_spin_lock_irqsave(&ctx->lock, *flags); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock_irqrestore(&ctx->lock, *flags); + rcu_read_unlock(); + preempt_enable(); goto retry; } @@ -973,6 +985,7 @@ retry: } } rcu_read_unlock(); + preempt_enable(); return ctx; } -- cgit v1.2.3 From 1b375dc30710180c4b88cc59caba6e3481ec5c8b Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 5 Jul 2013 09:29:32 +0200 Subject: mutex: Move ww_mutex definitions to ww_mutex.h Move the definitions for wound/wait mutexes out to a separate header, ww_mutex.h. This reduces clutter in mutex.h, and increases readability. Suggested-by: Linus Torvalds Signed-off-by: Maarten Lankhorst Acked-by: Peter Zijlstra Acked-by: Rik van Riel Acked-by: Maarten Lankhorst Cc: Dave Airlie Link: http://lkml.kernel.org/r/51D675DC.3000907@canonical.com [ Tidied up the code a bit. ] Signed-off-by: Ingo Molnar --- kernel/mutex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index e581ada5faf4..ff05f4bd86eb 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -18,6 +18,7 @@ * Also see Documentation/mutex-design.txt. */ #include <linux/mutex.h> +#include <linux/ww_mutex.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/export.h> -- cgit v1.2.3 From a272dcca1802a7e265a56e60b0d0a6715b0a8ac2 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 11 Jul 2013 07:00:59 -0700 Subject: tick: broadcast: Check broadcast mode on CPU hotplug On ARM systems the dummy clockevent is registered with the cpu hotplug notifier chain before any other per-cpu clockevent. This has the side-effect of causing the dummy clockevent to be registered first in every hotplug sequence. Because the dummy is first, we'll try to turn the broadcast source on but the code in tick_device_uses_broadcast() assumes the broadcast source is in periodic mode and calls tick_broadcast_start_periodic() unconditionally. On boot this isn't a problem because we typically haven't switched into oneshot mode yet (if at all). During hotplug, if the broadcast source isn't in periodic mode we'll replace the broadcast oneshot handler with the broadcast periodic handler and start emulating oneshot mode when we shouldn't. Due to the way the broadcast oneshot handler programs the next_event it's possible for it to contain KTIME_MAX and cause us to hang the system when the periodic handler tries to program the next tick. Fix this by using the appropriate function to start the broadcast source.
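An illustrative userspace sketch of this bug class and of the mode-aware dispatch the fix adopts; the enum, variable, and function names are hypothetical stand-ins, not the kernel's, and the authoritative change is the hunk in tick_device_uses_broadcast() below.

#include <stdio.h>

/* Stand-ins for TICKDEV_MODE_PERIODIC / TICKDEV_MODE_ONESHOT. */
enum tickdev_mode { MODE_PERIODIC, MODE_ONESHOT };

/* Assume the broadcast device has already been switched to oneshot. */
static enum tickdev_mode broadcast_mode = MODE_ONESHOT;

static void start_periodic(void) { puts("periodic broadcast handler installed"); }
static void setup_oneshot(void)  { puts("oneshot broadcast handler installed"); }

/* Buggy shape: assumes periodic mode and clobbers the oneshot handler. */
static void register_dummy_buggy(void)
{
	start_periodic();
}

/* Fixed shape: dispatch on the device's actual mode, as the patch does. */
static void register_dummy_fixed(void)
{
	if (broadcast_mode == MODE_PERIODIC)
		start_periodic();
	else
		setup_oneshot();
}

int main(void)
{
	register_dummy_buggy();	/* wrong handler while in oneshot mode */
	register_dummy_fixed();	/* handler matches the active mode */
	return 0;
}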
Reported-by: Stephen Warren Tested-by: Stephen Warren Signed-off-by: Stephen Boyd Cc: Mark Rutland Cc: Marc Zyngier Cc: ARM kernel mailing list Cc: John Stultz Cc: Joseph Lo Link: http://lkml.kernel.org/r/20130711140059.GA27430@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 6d3f91631de6..218bcb565fed 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -157,7 +157,10 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_broadcast_mask); - tick_broadcast_start_periodic(bc); + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); ret = 1; } else { /* -- cgit v1.2.3 From 971ee28cbd1ccd87b3164facd9359a534c1d2892 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Jun 2013 11:18:53 +0200 Subject: sched: Fix HRTICK David reported that the HRTICK sched feature was borken; which was enough motivation for me to finally fix it ;-) We should not allow hrtimer code to do softirq wakeups while holding scheduler locks. The hrtimer code only needs this when we accidentally try to program an expired time. We don't much care about those anyway since we have the regular tick to fall back to. Reported-by: David Ahern Tested-by: David Ahern Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130628091853.GE29209@dyad.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b1f2e533b95..0d8eb4525e76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void) #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock. 
*/ static void hrtick_clear(struct rq *rq) @@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) } #ifdef CONFIG_SMP + +static int __hrtick_restart(struct rq *rq) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = hrtimer_get_softexpires(timer); + + return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); +} + /* * called from hardirq (IPI) context */ @@ -412,7 +414,7 @@ static void __hrtick_start(void *arg) struct rq *rq = arg; raw_spin_lock(&rq->lock); - hrtimer_restart(&rq->hrtick_timer); + __hrtick_restart(rq); rq->hrtick_csd_pending = 0; raw_spin_unlock(&rq->lock); } @@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_set_expires(timer, time); if (rq == this_rq()) { - hrtimer_restart(timer); + __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; -- cgit v1.2.3 From 913ffdb54366f94eec65c656cae8c6e00e1ab1b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 11 Jul 2013 16:34:48 -0700 Subject: cgroup: replace task_cgroup_path_from_hierarchy() with task_cgroup_path() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit task_cgroup_path_from_hierarchy() was added for the planned new users and none of the currently planned users wants to know about multiple hierarchies. This patch drops the multiple hierarchy part and makes it always return the path in the first non-dummy hierarchy. As unified hierarchy will always have id 1, this is guaranteed to return the path for the unified hierarchy if mounted; otherwise, it will return the path from the hierarchy which happens to occupy the lowest hierarchy id, which will usually be the first hierarchy mounted after boot. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Lennart Poettering Cc: Kay Sievers Cc: Jan Kaluža --- kernel/cgroup.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5583d10a325..afb8d53ca6c7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1846,36 +1846,43 @@ out: EXPORT_SYMBOL_GPL(cgroup_path); /** - * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy + * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task - * @hierarchy_id: the hierarchy to look up @task's cgroup from * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and - * copy its path into @buf. This function grabs cgroup_mutex and shouldn't - * be used inside locks used by cgroup controller callbacks. + * Determine @task's cgroup on the first (the one with the lowest non-zero + * hierarchy_id) cgroup hierarchy and copy its path into @buf. This + * function grabs cgroup_mutex and shouldn't be used inside locks used by + * cgroup controller callbacks. + * + * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. 
*/ -int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, - char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroupfs_root *root; - struct cgroup *cgrp = NULL; - int ret = -ENOENT; + struct cgroup *cgrp; + int hierarchy_id = 1, ret = 0; + + if (buflen < 2) + return -ENAMETOOLONG; mutex_lock(&cgroup_mutex); - root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); + root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); + if (root) { cgrp = task_cgroup_from_root(task, root); ret = cgroup_path(cgrp, buf, buflen); + } else { + /* if no hierarchy exists, everyone is in "/" */ + memcpy(buf, "/", 2); } mutex_unlock(&cgroup_mutex); - return ret; } -EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); +EXPORT_SYMBOL_GPL(task_cgroup_path); /* * Control Group taskset -- cgit v1.2.3 From 786e1448d9c5d2a469bcc9d2aecacd418ee1aca0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Jul 2013 17:50:23 +0400 Subject: cgroup: we can use simple_lookup() now Signed-off-by: Al Viro --- kernel/cgroup.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5583d10a325..0e0b20b8c5db 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -802,7 +802,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, unsigned long subsys_mask); @@ -2642,7 +2641,7 @@ static const struct inode_operations cgroup_file_inode_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = cgroup_lookup, + .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, @@ -2652,14 +2651,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { .removexattr = cgroup_removexattr, }; -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) -{ - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_add(dentry, NULL); - return NULL; -} - /* * Check if a file is a control file */ -- cgit v1.2.3 From e5248a111bf4048a9f3fab1a9c94c4630a10592a Mon Sep 17 00:00:00 2001 From: Liu ShuoX Date: Thu, 11 Jul 2013 16:03:45 +0800 Subject: PM / Sleep: avoid 'autosleep' in shutdown progress Prevent automatic system suspend from happening during system shutdown by making try_to_suspend() check system_state and return immediately if it is not SYSTEM_RUNNING. This prevents the following breakage from happening (scenario from Zhang Yanmin): Kernel starts shutdown and calls all device driver's shutdown callback. When a driver's shutdown is called, the last wakelock is released and suspend-to-ram starts. However, as some driver's shut down callbacks already shut down devices and disabled runtime pm, the suspend-to-ram calls driver's suspend callback without noticing that device is already off and causes crash. [rjw: Changelog] Signed-off-by: Liu ShuoX Cc: 3.5+ Signed-off-by: Rafael J. 
Wysocki --- kernel/power/autosleep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index c6422ffeda9a..9012ecf7b814 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work) mutex_lock(&autosleep_lock); - if (!pm_save_wakeup_count(initial_count)) { + if (!pm_save_wakeup_count(initial_count) || + system_state != SYSTEM_RUNNING) { mutex_unlock(&autosleep_lock); goto out; } -- cgit v1.2.3 From 49fb4c6290c70c418a5c25eee996d6b55ea132d6 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 19 Jun 2013 14:52:21 -0400 Subject: rcu: delete __cpuinit usage from all rcu files The __cpuinit type of throwaway sections might have made sense some time ago when RAM was more constrained, but now the savings do not offset the cost and complications. For example, the fix in commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time") is a good example of the nasty type of bugs that can be created with improper use of the various __init prefixes. After a discussion on LKML[1] it was decided that cpuinit should go the way of devinit and be phased out. Once all the users are gone, we can then finally remove the macros themselves from linux/init.h. This removes all the drivers/rcu uses of the __cpuinit macros from all C files. [1] https://lkml.org/lkml/2013/5/20/589 Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Dipankar Sarma Reviewed-by: Josh Triplett Signed-off-by: Paul Gortmaker --- kernel/rcutorture.c | 6 +++--- kernel/rcutree.c | 6 +++--- kernel/rcutree.h | 4 ++-- kernel/rcutree_plugin.h | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b1fa5510388d..f4871e52c546 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1476,7 +1476,7 @@ rcu_torture_shutdown(void *arg) * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ -static int __cpuinit +static int rcu_torture_onoff(void *arg) { int cpu; @@ -1558,7 +1558,7 @@ rcu_torture_onoff(void *arg) return 0; } -static int __cpuinit +static int rcu_torture_onoff_init(void) { int ret; @@ -1601,7 +1601,7 @@ static void rcu_torture_onoff_cleanup(void) * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. */ -static int __cpuinit rcu_torture_stall(void *args) +static int rcu_torture_stall(void *args) { unsigned long stop_at; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e08abb9461ac..068de3a93606 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2910,7 +2910,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * can accept some slop in the rsp->completed access due to the fact * that this CPU cannot possibly have any RCU callbacks in flight yet. */ -static void __cpuinit +static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) { unsigned long flags; @@ -2962,7 +2962,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) mutex_unlock(&rsp->onoff_mutex); } -static void __cpuinit rcu_prepare_cpu(int cpu) +static void rcu_prepare_cpu(int cpu) { struct rcu_state *rsp; @@ -2974,7 +2974,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu) /* * Handle CPU online/offline notification events. 
*/ -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, +static int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4a39d364493c..b3832581043c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -521,10 +521,10 @@ static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); static void rcu_idle_count_callbacks_posted(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 63098a59216e..769e12e3151b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1352,7 +1352,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * already exist. We only create this kthread for preemptible RCU. * Returns zero if all is well, a negated errno otherwise. */ -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp) { int rnp_index = rnp - &rsp->node[0]; @@ -1507,7 +1507,7 @@ static int __init rcu_spawn_kthreads(void) } early_initcall(rcu_spawn_kthreads); -static void __cpuinit rcu_prepare_kthreads(int cpu) +static void rcu_prepare_kthreads(int cpu) { struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; @@ -1549,7 +1549,7 @@ static int __init rcu_scheduler_really_started(void) } early_initcall(rcu_scheduler_really_started); -static void __cpuinit rcu_prepare_kthreads(int cpu) +static void rcu_prepare_kthreads(int cpu) { } -- cgit v1.2.3 From 0db0628d90125193280eabb501c94feaf48fa9ab Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 19 Jun 2013 14:53:51 -0400 Subject: kernel: delete __cpuinit usage from all core kernel files The __cpuinit type of throwaway sections might have made sense some time ago when RAM was more constrained, but now the savings do not offset the cost and complications. For example, the fix in commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time") is a good example of the nasty type of bugs that can be created with improper use of the various __init prefixes. After a discussion on LKML[1] it was decided that cpuinit should go the way of devinit and be phased out. Once all the users are gone, we can then finally remove the macros themselves from linux/init.h. This removes all the uses of the __cpuinit macros from C files in the core kernel directories (kernel, init, lib, mm, and include) that don't really have a specific maintainer. 
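For readers who have not met the annotation being removed: __cpuinit/__cpuinitdata placed code and data in sections that could be discarded after boot when CONFIG_HOTPLUG_CPU was off, which is what made stale references to them so dangerous. A minimal before/after sketch of the pattern this pair of patches applies tree-wide (my_cpu_notify and my_nb are hypothetical names; this assumes kernel headers and is not runnable stand-alone):

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

/* Before: on kernels without CPU hotplug this memory could be freed
 * after init, so any late call into it was a bug of the class cited
 * in the changelog above. */
static int __cpuinit my_cpu_notify_old(struct notifier_block *nb,
				       unsigned long action, void *hcpu)
{
	return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata my_nb_old = {
	.notifier_call = my_cpu_notify_old,
};

/* After: the annotations are simply deleted; the callback and its
 * notifier_block become ordinary, always-resident kernel code. */
static int my_cpu_notify(struct notifier_block *nb,
			 unsigned long action, void *hcpu)
{
	return NOTIFY_OK;
}
static struct notifier_block my_nb = {
	.notifier_call = my_cpu_notify,
};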
[1] https://lkml.org/lkml/2013/5/20/589 Signed-off-by: Paul Gortmaker --- kernel/cpu.c | 6 +++--- kernel/events/core.c | 4 ++-- kernel/fork.c | 2 +- kernel/hrtimer.c | 6 +++--- kernel/printk.c | 2 +- kernel/profile.c | 2 +- kernel/relay.c | 2 +- kernel/sched/core.c | 12 ++++++------ kernel/sched/fair.c | 2 +- kernel/smp.c | 2 +- kernel/smpboot.c | 2 +- kernel/softirq.c | 8 ++++---- kernel/time/tick-sched.c | 2 +- kernel/timer.c | 10 +++++----- kernel/workqueue.c | 4 ++-- 15 files changed, 33 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 198a38883e64..b2b227b82123 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -366,7 +366,7 @@ EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen) { int ret, nr_calls = 0; void *hcpu = (void *)(long)cpu; @@ -419,7 +419,7 @@ out: return ret; } -int __cpuinit cpu_up(unsigned int cpu) +int cpu_up(unsigned int cpu) { int err = 0; @@ -618,7 +618,7 @@ core_initcall(cpu_hotplug_pm_sync_init); * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ -void __cpuinit notify_cpu_starting(unsigned int cpu) +void notify_cpu_starting(unsigned int cpu) { unsigned long val = CPU_STARTING; diff --git a/kernel/events/core.c b/kernel/events/core.c index eba8fb5834ae..f3e9dce39bc9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7630,7 +7630,7 @@ static void __init perf_event_init_all_cpus(void) } } -static void __cpuinit perf_event_init_cpu(int cpu) +static void perf_event_init_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -7719,7 +7719,7 @@ static struct notifier_block perf_reboot_notifier = { .priority = INT_MIN, }; -static int __cpuinit +static int perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; diff --git a/kernel/fork.c b/kernel/fork.c index 66635c80a813..403d2bb8a968 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1546,7 +1546,7 @@ static inline void init_idle_pids(struct pid_link *links) } } -struct task_struct * __cpuinit fork_idle(int cpu) +struct task_struct *fork_idle(int cpu) { struct task_struct *task; task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f0f4fe29cd21..383319bae3f7 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1659,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, /* * Functions related to boot-time initialization: */ -static void __cpuinit init_hrtimers_cpu(int cpu) +static void init_hrtimers_cpu(int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; @@ -1740,7 +1740,7 @@ static void migrate_hrtimers(int scpu) #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, +static int hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int scpu = (long)hcpu; @@ -1773,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata hrtimers_nb = { +static struct notifier_block hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/printk.c b/kernel/printk.c index d37d45c90ae6..69b0890ed7e5 100644 --- a/kernel/printk.c +++ 
b/kernel/printk.c @@ -1921,7 +1921,7 @@ void resume_console(void) * called when a new CPU comes online (or fails to come up), and ensures * that any such output gets printed. */ -static int __cpuinit console_cpu_notify(struct notifier_block *self, +static int console_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action) { diff --git a/kernel/profile.c b/kernel/profile.c index 0bf400737660..6631e1ef55ab 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -331,7 +331,7 @@ out: put_cpu(); } -static int __cpuinit profile_cpu_callback(struct notifier_block *info, +static int profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; diff --git a/kernel/relay.c b/kernel/relay.c index b91488ba2e5a..5001c9887db1 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan, * * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) */ -static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, +static int relay_hotcpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d8eb4525e76..b7c32cb7bfeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4133,7 +4133,7 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void __cpuinit init_idle_bootup_task(struct task_struct *idle) +void init_idle_bootup_task(struct task_struct *idle) { idle->sched_class = &idle_sched_class; } @@ -4146,7 +4146,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -4630,7 +4630,7 @@ static void set_rq_offline(struct rq *rq) * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. */ -static int __cpuinit +static int migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { int cpu = (long)hcpu; @@ -4684,12 +4684,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * happens before everything else. This has to be lower priority than * the notifier in the perf_event subsystem, though. 
*/ -static struct notifier_block __cpuinitdata migration_notifier = { +static struct notifier_block migration_notifier = { .notifier_call = migration_call, .priority = CPU_PRI_MIGRATION, }; -static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { @@ -4702,7 +4702,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, } } -static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f77f9c527449..bb456f44b7b1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5506,7 +5506,7 @@ void nohz_balance_enter_idle(int cpu) set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } -static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, +static int sched_ilb_notifier(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { diff --git a/kernel/smp.c b/kernel/smp.c index 4dba0f7b72ad..fe9f773d7114 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { +static struct notifier_block hotplug_cfd_notifier = { .notifier_call = hotplug_cfd, }; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 02fc5c933673..eb89e1807408 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -24,7 +24,7 @@ */ static DEFINE_PER_CPU(struct task_struct *, idle_threads); -struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) +struct task_struct *idle_thread_get(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index ca25e6e704a2..be3d3514c325 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -699,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) } EXPORT_SYMBOL(send_remote_softirq); -static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, +static int remote_softirq_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { /* @@ -728,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { +static struct notifier_block remote_softirq_cpu_notifier = { .notifier_call = remote_softirq_cpu_notify, }; @@ -830,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit cpu_callback(struct notifier_block *nfb, +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -845,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cpu_nfb = { +static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69601726a745..e80183f4a6c4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -298,7 +298,7 @@ static int __init tick_nohz_full_setup(char *str) } __setup("nohz_full=", tick_nohz_full_setup); -static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, +static int 
tick_nohz_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { diff --git a/kernel/timer.c b/kernel/timer.c index 15bc1b41021d..4296d13db3d1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1505,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -static int __cpuinit init_timers_cpu(int cpu) +static int init_timers_cpu(int cpu) { int j; struct tvec_base *base; - static char __cpuinitdata tvec_base_done[NR_CPUS]; + static char tvec_base_done[NR_CPUS]; if (!tvec_base_done[cpu]) { static char boot_done; @@ -1577,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea } } -static void __cpuinit migrate_timers(int cpu) +static void migrate_timers(int cpu) { struct tvec_base *old_base; struct tvec_base *new_base; @@ -1610,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit timer_cpu_notify(struct notifier_block *self, +static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1635,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata timers_nb = { +static struct notifier_block timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4a0c3c..0b72e816b8d0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4644,7 +4644,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. */ -static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, +static int workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -4697,7 +4697,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, * Workqueues should be brought down after normal priority CPU notifiers. * This will be registered as low priority CPU notifier. */ -static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, +static int workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { -- cgit v1.2.3 From 991821c86c2fb6cc4104ce679247864dbc070a83 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Mon, 15 Jul 2013 16:32:34 +0800 Subject: tracing: Use correct config guard CONFIG_STACK_TRACER We should use CONFIG_STACK_TRACER to guard readme text of stack tracer related file, not CONFIG_STACKTRACE. 
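The distinction, in a compilable miniature: CONFIG_STACKTRACE is the generic stack-capture facility and can be selected by other features, so it may be enabled while the stack tracer itself is not; the readme lines must therefore key off CONFIG_STACK_TRACER. This is a sketch only -- the macro is defined by hand to simulate one kconfig outcome, and the readme string is abbreviated:

#include <stdio.h>

/* Simulated config: generic facility on, stack tracer off. */
#define CONFIG_STACKTRACE 1
/* CONFIG_STACK_TRACER intentionally left undefined */

static const char readme_msg[] =
	"tracing mini-HOWTO:\n"
#ifdef CONFIG_STACK_TRACER	/* fixed guard: matches the feature described */
	" stack_trace\t- Shows the max stack trace when active\n"
#endif
	;

int main(void)
{
	fputs(readme_msg, stdout);	/* stack_trace line correctly absent */
	return 0;
}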
Link: http://lkml.kernel.org/r/51E3B3A2.8080609@huawei.com Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0cd500bffd9b..25b91afc29e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3537,14 +3537,14 @@ static const char readme_msg[] = "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" "\t\t\t Read the contents for more information\n" #endif -#ifdef CONFIG_STACKTRACE +#ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" #endif -#endif /* CONFIG_STACKTRACE */ +#endif /* CONFIG_STACK_TRACER */ ; static ssize_t
-- cgit v1.2.3
From b9b3259746d77f4fcb786e2a43c25bcc40773755 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 14 Jul 2013 16:05:51 -0700 Subject: sysfs.h: add __ATTR_RW() macro A number of parts of the kernel created their own version of this; we might as well have the sysfs core provide it instead. Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel')
diff --git a/kernel/events/core.c b/kernel/events/core.c index eba8fb5834ae..dd9878029d1f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6234,8 +6234,6 @@ perf_event_mux_interval_ms_store(struct device *dev, return count; } -#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) - static struct device_attribute pmu_dev_attrs[] = { __ATTR_RO(type), __ATTR_RW(perf_event_mux_interval_ms),
-- cgit v1.2.3
From 146c3442f2dd0f50d9431aea5d0d10dfd97c9999 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Mon, 15 Jul 2013 16:32:44 +0800 Subject: tracing: Use trace_seq_puts()/trace_seq_putc() where possible For strings without format specifiers, use trace_seq_puts() or trace_seq_putc().
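The rationale carries over directly to stdio, shown here as a small self-contained sketch in userspace C (not the kernel's trace_seq API): for a fixed string there is nothing for the format parser to do, and a literal that happened to contain a '%' would even be misinterpreted by the printf path.

#include <stdio.h>

int main(void)
{
	/* printf() scans the whole string for '%' conversions at run time */
	printf("# compressed entry header\n");

	/* fputs()/putc() copy the bytes directly: cheaper, and safe even if
	 * the literal contains a '%' character */
	fputs("# compressed entry header\n", stdout);
	putc('\n', stdout);
	return 0;
}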
Link: http://lkml.kernel.org/r/51E3B3AC.1000605@huawei.com Signed-off-by: zhangwei(Jovi) [ fixed a trace_seq_putc(s, " ") to trace_seq_putc(s, ' ') ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 10 +++---- kernel/trace/trace_events_filter.c | 4 +-- kernel/trace/trace_functions_graph.c | 52 ++++++++++++++++++------------------ kernel/trace/trace_mmiotrace.c | 8 +++--- kernel/trace/trace_output.c | 14 +++++----- kernel/trace/trace_syscalls.c | 2 +- 6 files changed, 45 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e444ff88f0a4..eef2e566b2e7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "# compressed entry header\n"); - ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); - ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); - ret = trace_seq_printf(s, "\tarray : 32 bits\n"); - ret = trace_seq_printf(s, "\n"); + ret = trace_seq_puts(s, "# compressed entry header\n"); + ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); + ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_puts(s, "\tarray : 32 bits\n"); + ret = trace_seq_putc(s, '\n'); ret = trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0d883dc057d6..0c7b75a8acc8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -646,7 +646,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); + trace_seq_puts(s, "none\n"); mutex_unlock(&event_mutex); } @@ -660,7 +660,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); + trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8388bc99f2ee..d56ae9bae00b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* First spaces to align center */ for (i = 0; i < spaces / 2; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* Last spaces to align center */ for (i = 0; i < spaces - (spaces / 2); i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) ------------------------------------------ */ - ret = trace_seq_printf(s, + ret = trace_seq_puts(s, " ------------------------------------------\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " => "); + ret = trace_seq_puts(s, " => "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ 
-524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, + ret = trace_seq_puts(s, "\n ------------------------------------------\n\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return ret; if (type == TRACE_GRAPH_ENT) - ret = trace_seq_printf(s, "==========>"); + ret = trace_seq_puts(s, "==========>"); else - ret = trace_seq_printf(s, "<=========="); + ret = trace_seq_puts(s, "<=========="); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (ret != TRACE_TYPE_HANDLED) return ret; - ret = trace_seq_printf(s, "\n"); + ret = trace_seq_putc(s, '\n'); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) len += strlen(nsecs_str); } - ret = trace_seq_printf(s, " us "); + ret = trace_seq_puts(s, " us "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Print remaining spaces to fit the row's width */ for (i = len; i < 7; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, /* No real adata, just filling the column with spaces */ switch (duration) { case DURATION_FILL_FULL: - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; case DURATION_FILL_START: - ret = trace_seq_printf(s, " "); + ret = trace_seq_puts(s, " "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; case DURATION_FILL_END: - ret = trace_seq_printf(s, " |"); + ret = trace_seq_puts(s, " |"); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { /* Duration exceeded 100 msecs */ if (duration > 100000ULL) - ret = trace_seq_printf(s, "! "); + ret = trace_seq_puts(s, "! "); /* Duration exceeded 10 msecs */ else if (duration > 10000ULL) - ret = trace_seq_printf(s, "+ "); + ret = trace_seq_puts(s, "+ "); } /* @@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, * to fill out the space. 
*/ if (ret == -1) - ret = trace_seq_printf(s, " "); + ret = trace_seq_puts(s, " "); /* Catching here any failure happenned above */ if (!ret) @@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, if (ret != TRACE_TYPE_HANDLED) return ret; - ret = trace_seq_printf(s, "| "); + ret = trace_seq_puts(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter, /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * belongs to, write out the function name. */ if (func_match) { - ret = trace_seq_printf(s, "}\n"); + ret = trace_seq_puts(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } else { @@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, /* Indentation */ if (depth > 0) for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* The comment */ - ret = trace_seq_printf(s, "/* "); + ret = trace_seq_puts(s, "/* "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, s->len--; } - ret = trace_seq_printf(s, " */\n"); + ret = trace_seq_puts(s, " */\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index a5e8f4878bfa..b3dcfb2f0fef 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) if (drv) ret += trace_seq_printf(s, " %s\n", drv->name); else - ret += trace_seq_printf(s, " \n"); + ret += trace_seq_puts(s, " \n"); return ret; } @@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter) struct header_iter *hiter; struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION 20070824\n"); + trace_seq_puts(s, "VERSION 20070824\n"); hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); if (!hiter) @@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) (rw->value >> 0) & 0xff, rw->pc, 0); break; default: - ret = trace_seq_printf(s, "rw what?\n"); + ret = trace_seq_puts(s, "rw what?\n"); break; } if (ret) @@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) secs, usec_rem, m->map_id, 0UL, 0); break; default: - ret = trace_seq_printf(s, "map what?\n"); + ret = trace_seq_puts(s, "map what?\n"); 
break; } if (ret)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bb922d9ee51b..34e7cbac0c9c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_puts(s, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ret) ret = trace_seq_puts(s, "??"); if (ret) - ret = trace_seq_puts(s, "\n"); + ret = trace_seq_putc(s, '\n'); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); - ret = trace_seq_puts(s, "\n"); + ret = trace_seq_putc(s, '\n'); } if (mm) @@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) int ret; if (!ip) - return trace_seq_printf(s, "0"); + return trace_seq_putc(s, '0'); if (sym_flags & TRACE_ITER_SYM_OFFSET) ret = seq_print_sym_offset(s, "%s", ip); @@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, goto partial; if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - if (!trace_seq_printf(s, " <-")) + if (!trace_seq_puts(s, " <-")) goto partial; if (!seq_print_ip_sym(s, field->parent_ip, flags)) goto partial; } - if (!trace_seq_printf(s, "\n")) + if (!trace_seq_putc(s, '\n')) goto partial; return TRACE_TYPE_HANDLED; @@ -1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, if (!seq_print_ip_sym(s, *p, flags)) goto partial; - if (!trace_seq_puts(s, "\n")) + if (!trace_seq_putc(s, '\n')) goto partial; }
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 322e16461072..061156215721 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, entry = syscall_nr_to_meta(syscall); if (!entry) { - trace_seq_printf(s, "\n"); + trace_seq_putc(s, '\n'); return TRACE_TYPE_HANDLED; }
-- cgit v1.2.3
From d611851b421731e2afd9cb956daae001af57a423 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Mon, 15 Jul 2013 16:32:50 +0800 Subject: tracing: Typo fix on ring buffer comments Some comments do not match the real function names; update them. This patch also adds some missing function argument descriptions. Link: http://lkml.kernel.org/r/51E3B3B2.4080307@huawei.com Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eef2e566b2e7..cc2f66f68dc5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, } /** - * check_pages - integrity check of buffer pages + * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not @@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self, #endif /** - * ring_buffer_alloc - allocate a new ring_buffer + * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer.
* @@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work) * ring_buffer_resize - resize the ring buffer * @buffer: the buffer to resize. * @size: the new size. + * @cpu_id: the cpu buffer to resize * * Minimum size is 2 * BUF_PAGE_SIZE. * @@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * expected. * * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_prepare_sync. + * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going * for real. * - * This overall must be paired with ring_buffer_finish. + * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) @@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); * an intervening ring_buffer_read_prepare_sync must have been * performed. * - * Must be paired with ring_buffer_finish. + * Must be paired with ring_buffer_read_finish. */ void ring_buffer_read_start(struct ring_buffer_iter *iter) @@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** - * ring_buffer_finish - finish reading the iterator of the buffer + * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * * This re-enables the recording to the buffer, and frees the @@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. + * @cpu: the cpu buffer to allocate. * * This function is used in conjunction with ring_buffer_read_page. * When reading a full page from the ring buffer, these functions @@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * to swap with a page in the ring buffer. * * for example: - * rpage = ring_buffer_alloc_read_page(buffer); + * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (!rpage) * return error; * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); -- cgit v1.2.3 From b8ebfd3f7113b63dda93d76bfec638c00e6bd514 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Jun 2013 19:02:04 +0200 Subject: tracing/function: Avoid perf_trace_buf_*() if event_function.perf_events is empty perf_trace_buf_prepare() + perf_trace_buf_submit(head, task => NULL) make no sense if hlist_empty(head). Change perf_ftrace_function_call() to check event_function.perf_events beforehand. 
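The shape of the optimization can be sketched in ordinary C. The event/consumer types below are hypothetical stand-ins for the perf hlist, and perf_trace_buf_prepare()/perf_trace_buf_submit() are modelled by malloc and printf: test for listeners first, and skip the buffer setup entirely when there are none.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct consumer { struct consumer *next; };
struct event   { struct consumer *consumers; };

static void emit(struct event *ev, const char *payload)
{
	/* the fix: bail out before paying for buffer setup when the
	 * consumer list is empty (the hlist_empty(head) test, hoisted) */
	if (ev->consumers == NULL)
		return;

	char *buf = malloc(strlen(payload) + 1);	/* ~ perf_trace_buf_prepare() */
	if (!buf)
		return;
	strcpy(buf, payload);
	printf("delivering: %s\n", buf);		/* ~ perf_trace_buf_submit() */
	free(buf);
}

int main(void)
{
	struct consumer c  = { NULL };
	struct event quiet = { NULL };
	struct event busy  = { &c };

	emit(&quiet, "dropped before any allocation");	/* nothing happens */
	emit(&busy, "hello");				/* full path */
	return 0;
}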
Link: http://lkml.kernel.org/r/20130617170204.GA19803@redhat.com Acked-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 84b1e045faba..12df5573086e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -266,6 +266,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct pt_regs regs; int rctx; + head = this_cpu_ptr(event_function.perf_events); + if (hlist_empty(head)) + return; + #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) @@ -279,8 +283,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - - head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 1, ®s, head, NULL); -- cgit v1.2.3 From 421c7860c6e1989da3962fafdd6699316c9f8e20 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Jun 2013 19:02:07 +0200 Subject: tracing/syscall: Avoid perf_trace_buf_*() if sys_data->perf_events is empty perf_trace_buf_prepare() + perf_trace_buf_submit(head, task => NULL) make no sense if hlist_empty(head). Change perf_syscall_enter/exit() to check sys_data->{enter,exit}_event->perf_events beforehand. Link: http://lkml.kernel.org/r/20130617170207.GA19806@redhat.com Acked-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 061156215721..ac0085777fbd 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -566,6 +566,10 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* get the size after alignment with the u32 buffer size field */ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); @@ -583,8 +587,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - - head = this_cpu_ptr(sys_data->enter_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } @@ -642,6 +644,10 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) + return; + /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -661,8 +667,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - - head = this_cpu_ptr(sys_data->exit_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -- cgit v1.2.3 From cd92bf61d6d70bd3eb33b46d600e3f3eb9c5778a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Jun 2013 19:02:11 +0200 Subject: tracing/perf: Move the PERF_MAX_TRACE_SIZE check into perf_trace_buf_prepare() Every perf_trace_buf_prepare() caller does 
WARN_ONCE(size > PERF_MAX_TRACE_SIZE, message) and "message" is almost the same. Shift this WARN_ONCE() into perf_trace_buf_prepare(). This changes the meaning of _ONCE, but I think this is fine. - 4947014 2932448 10104832 17984294 1126b26 vmlinux + 4948422 2932448 10104832 17985702 11270a6 vmlinux on my build. Link: http://lkml.kernel.org/r/20130617170211.GA19813@redhat.com Acked-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 4 ++++ kernel/trace/trace_kprobe.c | 6 ------ kernel/trace/trace_syscalls.c | 12 ------------ kernel/trace/trace_uprobe.c | 2 -- 4 files changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 12df5573086e..80c36bcf66e8 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) + return NULL; + pc = preempt_count(); *rctxp = perf_swevent_get_recursion_context(); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ed6976493c8..ae6ce835b023 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1087,9 +1087,6 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) @@ -1120,9 +1117,6 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ac0085777fbd..8fd03657bc7d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -575,10 +575,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) - return; - rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, sys_data->enter_event->event.type, regs, &rctx); if (!rec) @@ -652,14 +648,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - /* - * Impossible, but be paranoid with the future - * How to put this check outside runtime? 
- */ - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "exit event has grown above perf buffer size")) - return; - rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, sys_data->exit_event->event.type, regs, &rctx); if (!rec)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d5d0cd368a56..a23d2d71188e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -818,8 +818,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu, size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; preempt_disable(); head = this_cpu_ptr(call->perf_events);
-- cgit v1.2.3
From a232e270dcb55a70ad3241bc6fc160fd9b5c9e6c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 9 Jul 2013 18:35:26 +0900 Subject: tracing/kprobe: Wait for disabling all running kprobe handlers Wait for all running kprobe handlers to finish when a kprobe event is disabled, since the caller, trace_remove_event_call(), assumes that the event being removed is completely disabled. With this change, ftrace can ensure that no event handlers are still running after the event has been disabled. Link: http://lkml.kernel.org/r/20130709093526.20138.93100.stgit@mhiramat-M0-7522 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ae6ce835b023..3811487e7a7a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -243,11 +243,11 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) static int disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { + struct event_file_link *link = NULL; + int wait = 0; int ret = 0; if (file) { - struct event_file_link *link; - link = find_event_file_link(tp, file); if (!link) { ret = -EINVAL; @@ -255,10 +255,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } list_del_rcu(&link->list); - /* synchronize with kprobe_trace_func/kretprobe_trace_func */ - synchronize_sched(); - kfree(link); - + wait = 1; if (!list_empty(&tp->files)) goto out; @@ -271,8 +268,22 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); + wait = 1; } out: + if (wait) { + /* + * Synchronize with kprobe_trace_func/kretprobe_trace_func + * to ensure disabled (all running handlers are finished). + * This is not only for kfree(), but also the caller, + * trace_remove_event_call() supposes it for releasing + * event_call related objects, which will be accessed in + * the kprobe_trace_func/kretprobe_trace_func. + */ + synchronize_sched(); + kfree(link); /* Ignored if link == NULL */ + } + return ret; }
-- cgit v1.2.3
From 609e85a70bcd0eedf4ec60639dbcfb1ab011e054 Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Wed, 10 Jul 2013 17:34:34 -0700 Subject: tracing: Fix error handling to ensure instances can always be removed Remove the debugfs directory for a tracing instance during creation if an error occurs that prevents the trace_array for that instance from being added to ftrace_trace_arrays. If the directory continues to exist after the error, it cannot be removed, because the respective trace_array is not in ftrace_trace_arrays.
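The pattern of the fix, unwinding a partially built object on a late failure, can be sketched in userspace C. Here mkdir()/rmdir() stand in for the debugfs create/remove calls, and register_instance() is a hypothetical step that fails:

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int register_instance(const char *path)
{
	(void)path;
	return -1;	/* force the failure path for demonstration */
}

static int new_instance_create(const char *path)
{
	int ret;

	if (mkdir(path, 0755) != 0)
		return -1;

	ret = register_instance(path);
	if (ret) {
		rmdir(path);	/* the fix: tear the directory down on error,
				 * or nothing will ever be able to remove it */
		return ret;
	}
	return 0;
}

int main(void)
{
	if (new_instance_create("/tmp/instance-X"))
		fprintf(stderr, "create failed, directory cleaned up\n");
	return 0;
}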
Link: http://lkml.kernel.org/r/1373502874-1706-2-git-send-email-azl@google.com Cc: stable@vger.kernel.org # 3.10 Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 25b91afc29e0..7c3da7bca05b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5973,8 +5973,10 @@ static int new_instance_create(const char *name) goto out_free_tr; ret = event_trace_add_tracer(tr->dir, tr); - if (ret) + if (ret) { + debugfs_remove_recursive(tr->dir); goto out_free_tr; + } init_tracer_debugfs(tr, tr->dir); -- cgit v1.2.3 From f77d09a384676bde6445413949d9d2c508ff3e62 Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Thu, 18 Jul 2013 11:18:44 -0700 Subject: tracing: Miscellaneous fixes for trace_array ref counting Some error paths did not handle ref counting properly, and some trace files need ref counting. Link: http://lkml.kernel.org/r/1374171524-11948-1-git-send-email-azl@google.com Cc: stable@vger.kernel.org # 3.10 Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++++++++++++------ kernel/trace/trace_events.c | 21 +++++++++++++++++++-- 2 files changed, 37 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7c3da7bca05b..7d9ceab42564 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3008,7 +3008,6 @@ static int tracing_release(struct inode *inode, struct file *file) iter = m->private; tr = iter->tr; - trace_array_put(tr); mutex_lock(&trace_types_lock); @@ -3023,6 +3022,9 @@ static int tracing_release(struct inode *inode, struct file *file) if (!iter->snapshot) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); + + __trace_array_put(tr); + mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -3447,6 +3449,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, static int tracing_trace_options_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; + int ret; if (tracing_disabled) return -ENODEV; @@ -3454,7 +3457,11 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) if (trace_array_get(tr) < 0) return -ENODEV; - return single_open(file, tracing_trace_options_show, inode->i_private); + ret = single_open(file, tracing_trace_options_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; } static const struct file_operations tracing_iter_fops = { @@ -3958,6 +3965,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; + __trace_array_put(tr); goto out; } @@ -4704,21 +4712,24 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) ret = PTR_ERR(iter); } else { /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) - return -ENOMEM; + goto out; iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { kfree(m); - return -ENOMEM; + goto out; } + ret = 0; + iter->tr = tr; iter->trace_buffer = &tc->tr->max_buffer; iter->cpu_file = tc->cpu; m->private = iter; file->private_data = m; } - +out: if (ret < 0) trace_array_put(tr); @@ -5328,9 +5339,10 @@ tracing_stats_read(struct file 
*filp, char __user *ubuf, } static const struct file_operations tracing_stats_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tc, .read = tracing_stats_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tc, }; #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7d854290bf81..7a75cb22eab7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1218,6 +1218,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { .start = t_start, @@ -1245,7 +1246,7 @@ static const struct file_operations ftrace_set_event_fops = { .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, - .release = seq_release, + .release = ftrace_event_release, }; static const struct file_operations ftrace_enable_fops = { @@ -1323,6 +1324,15 @@ ftrace_event_open(struct inode *inode, struct file *file, return ret; } +static int ftrace_event_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return seq_release(inode, file); +} + static int ftrace_event_avail_open(struct inode *inode, struct file *file) { @@ -1336,12 +1346,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; struct trace_array *tr = inode->i_private; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_events(tr); - return ftrace_event_open(inode, file, seq_ops); + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; } static struct event_subsystem *
-- cgit v1.2.3
From 8f768993394a8c0d3801033c11fd86ce8c88dcac Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 18 Jul 2013 14:41:51 -0400 Subject: tracing: Add ref_data to function and fgraph tracer structs The selftests for the function and function graph tracers are defined as __init, as they are only executed at boot up. The "tracer" structs associated with those tracers are not set up as __init, as they are used after boot. To stop mismatch warnings, those structures need to be annotated with __refdata. Currently, the tracer structures are defined as __read_mostly, as they do not really change. In the future they should be converted to consts, but that will take a little work, because they have a "next" pointer that gets updated when they are registered. That will have to wait till the next major release.
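The macro trick itself is plain C preprocessor work. A hedged userspace sketch follows; SELFTEST and the section names are illustrative stand-ins (the kernel uses __refdata and __read_mostly), and it assumes a GCC/Clang ELF target for the section attribute:

#include <stdio.h>

#ifdef SELFTEST
/* selftests may legitimately reference boot-only code, so place the
 * struct where such references are tolerated (the kernel's __refdata) */
# define __tracer_data __attribute__((section(".ref.data")))
#else
/* tracers rarely change after registration (the kernel's __read_mostly) */
# define __tracer_data __attribute__((section(".data.read_mostly")))
#endif

struct tracer {
	const char *name;
	int (*init)(void);
};

static int demo_init(void) { return 0; }

static struct tracer function_trace __tracer_data = {
	.name = "function",
	.init = demo_init,
};

int main(void)
{
	printf("tracer %s registered\n", function_trace.name);
	return function_trace.init();
}

Building once with -DSELFTEST and once without, then inspecting the object with objdump -t, shows the struct moving between the two placements while the C code stays identical.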
Link: http://lkml.kernel.org/r/1373596735.17876.84.camel@gandalf.local.home Reported-by: kbuild test robot Reported-by: Chen Gang Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +++++++++ kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a4f6e1828b6..57b7bb0d39b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +/* + * Tracer data references selftest functions that only occur + * on boot up. These can be __init functions. Thus, when selftests + * are enabled, then the tracers need to reference __init functions. + */ +#define __tracer_data __refdata +#else +/* Tracers are seldom changed. Optimize when selftests are disabled. */ +#define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b863f93b30f3..38fe1483c508 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) return 0; } -static struct tracer function_trace __read_mostly = +static struct tracer function_trace __tracer_data = { .name = "function", .init = function_trace_init, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56ae9bae00b..b5c09242683d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = { .funcs = &graph_functions }; -static struct tracer graph_trace __read_mostly = { +static struct tracer graph_trace __tracer_data = { .name = "function_graph", .open = graph_trace_open, .pipe_open = graph_trace_open, -- cgit v1.2.3 From 7710b639953b791610f0022a7d52d9801c93b969 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 18 Jul 2013 20:47:10 +0200 Subject: tracing: Simplify the iteration logic in f_start/f_next f_next() looks overcomplicated, and it is not strictly correct even if this doesn't matter. Say, FORMAT_FIELD_SEPERATOR should not return NULL (means EOF) if trace_get_fields() returns an empty list, we should simply advance to FORMAT_PRINTFMT as we do when we find the end of list. 1. Change f_next() to return "struct list_head *" rather than "ftrace_event_field *", and change f_show() to do list_entry(). This simplifies the code a bit, only f_show() needs to know about ftrace_event_field, and f_next() can play with ->prev directly 2. Change f_next() to not play with ->prev / return inside the switch() statement. It can simply set node = head/common_head, the prev-or-advance-to-the-next-magic below does all work. While at it. f_start() looks overcomplicated too. I don't think *pos == 0 makes sense as a separate case, just change this code to do "while" instead of "do/while". The patch also moves f_start() down, close to f_stop(). This is purely cosmetic, just to make the locking added by the next patch more clear/visible. 
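A compact userspace model of the simplified iteration follows. The sentinel values, the field list, and f_show() are all illustrative; the real code walks two kernel lists and also has a FORMAT_FIELD_SEPERATOR state, collapsed here for brevity.

#include <stdio.h>

enum { FORMAT_HEADER = 1, FORMAT_PRINTFMT = 2 };

static const char *fields[] = { "common_pid", "ip", "parent_ip" };
#define NFIELDS (long)(sizeof(fields) / sizeof(fields[0]))
#define FIRST_FIELD 16	/* field i is encoded as (void *)(FIRST_FIELD + i) */

static void *f_next(void *v, long *pos)
{
	long idx = (long)v;

	(*pos)++;
	if (idx == FORMAT_HEADER)
		return (void *)FIRST_FIELD;	/* advance into the list */
	if (idx == FORMAT_PRINTFMT)
		return NULL;			/* all done */
	if (idx - FIRST_FIELD + 1 < NFIELDS)
		return (void *)(idx + 1);	/* next field */
	return (void *)FORMAT_PRINTFMT;		/* end of list reached */
}

/* the simplified start: no *pos == 0 special case, just a while loop */
static void *f_start(long *pos)
{
	void *p = (void *)FORMAT_HEADER;
	long l = 0;

	while (l < *pos && p)
		p = f_next(p, &l);
	return p;
}

static void f_show(void *v)
{
	long idx = (long)v;

	if (idx == FORMAT_HEADER)
		printf("name: demo_event\n");
	else if (idx == FORMAT_PRINTFMT)
		printf("print fmt: \"ip=0x%%lx\"\n");
	else
		printf("\tfield: unsigned long %s;\n", fields[idx - FIRST_FIELD]);
}

int main(void)
{
	long pos = 0;

	for (void *p = f_start(&pos); p; p = f_next(p, &pos))
		f_show(p);
	return 0;
}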
Link: http://lkml.kernel.org/r/20130718184710.GA4783@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 60 +++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 38 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7a75cb22eab7..76defd91f9b4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -826,59 +826,33 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_event_call *call = m->private; - struct ftrace_event_field *field; struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); + struct list_head *node = v; (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - if (unlikely(list_empty(common_head))) - return NULL; - - field = list_entry(common_head->prev, - struct ftrace_event_field, link); - return field; + node = common_head; + break; case FORMAT_FIELD_SEPERATOR: - if (unlikely(list_empty(head))) - return NULL; - - field = list_entry(head->prev, struct ftrace_event_field, link); - return field; + node = head; + break; case FORMAT_PRINTFMT: /* all done */ return NULL; } - field = v; - if (field->link.prev == common_head) + node = node->prev; + if (node == common_head) return (void *)FORMAT_FIELD_SEPERATOR; - else if (field->link.prev == head) + else if (node == head) return (void *)FORMAT_PRINTFMT; - - field = list_entry(field->link.prev, struct ftrace_event_field, link); - - return field; -} - -static void *f_start(struct seq_file *m, loff_t *pos) -{ - loff_t l = 0; - void *p; - - /* Start by showing the header */ - if (!*pos) - return (void *)FORMAT_HEADER; - - p = (void *)FORMAT_HEADER; - do { - p = f_next(m, p, &l); - } while (p && l < *pos); - - return p; + else + return node; } static int f_show(struct seq_file *m, void *v) @@ -904,8 +878,7 @@ static int f_show(struct seq_file *m, void *v) return 0; } - field = v; - + field = list_entry(v, struct ftrace_event_field, link); /* * Smartly shows the array type(except dynamic array). * Normal: @@ -932,6 +905,17 @@ static int f_show(struct seq_file *m, void *v) return 0; } +static void *f_start(struct seq_file *m, loff_t *pos) +{ + void *p = (void *)FORMAT_HEADER; + loff_t l = 0; + + while (l < *pos && p) + p = f_next(m, p, &l); + + return p; +} + static void f_stop(struct seq_file *m, void *p) { }
-- cgit v1.2.3
From cd458ba9d5a5592d37b5145e560071e91ea762ac Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 18 Jul 2013 20:47:12 +0200 Subject: tracing: Do not (ab)use trace_seq in event_id_read() event_id_read() has no reason to kmalloc "struct trace_seq" (more than PAGE_SIZE!); it can use a small buffer instead. Note: "if (*ppos) return 0" looks strange and even wrong; simple_read_from_buffer() handles the ppos != 0 case correctly. And it seems that almost every user of trace_seq in this file should be converted too. Unless you use seq_open(), trace_seq buys nothing compared to the raw buffer, but it needs a bit more memory and code.
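The same idea in a runnable userspace form, with simple_read_from_buffer() modelled by a small memcpy-based helper (everything here is an illustrative analogue, not the kernel API):

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* rough analogue of simple_read_from_buffer(): copy out of an
 * already-formatted buffer, honouring the caller's file position */
static ssize_t read_from_buffer(char *to, size_t count, long *ppos,
				const char *from, size_t avail)
{
	size_t pos = (size_t)*ppos;

	if (pos >= avail)
		return 0;
	if (count > avail - pos)
		count = avail - pos;
	memcpy(to, from + pos, count);
	*ppos += (long)count;
	return (ssize_t)count;
}

static ssize_t event_id_read(char *ubuf, size_t cnt, long *ppos, int id)
{
	char buf[32];	/* comfortably enough for "%d\n"; no kmalloc needed */
	int len = snprintf(buf, sizeof(buf), "%d\n", id);

	return read_from_buffer(ubuf, cnt, ppos, buf, (size_t)len);
}

int main(void)
{
	char out[8];
	long pos = 0;
	ssize_t n;

	/* short reads work because the helper honours *ppos */
	while ((n = event_id_read(out, 2, &pos, 1234)) > 0)
		fwrite(out, 1, (size_t)n, stdout);	/* prints "1234\n" */
	return 0;
}

Note how the offset handling also makes the "if (*ppos) return 0" shortcut unnecessary in principle, exactly as the commit message observes.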
Link: http://lkml.kernel.org/r/20130718184712.GA4786@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 76defd91f9b4..898f868833f2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -947,23 +947,14 @@ static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - struct trace_seq *s; - int r; + char buf[32]; + int len; if (*ppos) return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->event.type); - - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - kfree(s); - return r; + len = sprintf(buf, "%d\n", call->event.type); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } static ssize_t
-- cgit v1.2.3
From a644a7e9587802eabb2e229177606f6a74a60fc1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jul 2013 16:20:36 +0200 Subject: tracing: Kill trace_array->waiter Trivial. trace_array->waiter has no users since 6eaaa5d5 "tracing/core: use appropriate waiting on trace_pipe". Link: http://lkml.kernel.org/r/20130719142036.GA1594@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57b7bb0d39b7..e7d643b8a907 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -214,7 +214,6 @@ struct trace_array { struct dentry *event_dir; struct list_head systems; struct list_head events; - struct task_struct *waiter; int ref; };
-- cgit v1.2.3
From e70e78e3c83b536730e31231dd9b979768d8df3c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jul 2013 17:36:44 +0200 Subject: tracing: Kill the unbalanced tr->ref++ in tracing_buffers_open() tracing_buffers_open() does trace_array_get() and then it wrongly increments tr->ref again under trace_types_lock. This means that every caller leaks trace_array: # cd /sys/kernel/debug/tracing/ # mkdir instances/X # true < instances/X/per_cpu/cpu0/trace_pipe_raw # rmdir instances/X rmdir: failed to remove `instances/X': Device or resource busy Link: http://lkml.kernel.org/r/20130719153644.GA18899@redhat.com Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Masami Hiramatsu Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d9ceab42564..3f2477713aca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4959,8 +4959,6 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); - tr->ref++; - info->iter.tr = tr; info->iter.cpu_file = tc->cpu; info->iter.trace = tr->current_trace;
-- cgit v1.2.3
From 42577ca8c3616baaafdd8f167b2e1fb959026081 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Jul 2013 16:49:24 +0100 Subject: Fix __wait_on_atomic_t() to call the action func if the counter != 0 Fix __wait_on_atomic_t() so that it calls the action func if the counter != 0 rather than if the counter is 0, so as to be analogous to __wait_on_bit(). Thanks to Yacine, who found this by visual inspection.
This will affect FS-Cache in that it could fail to sleep correctly when trying to clean up after a netfs cookie is withdrawn. Reported-by: Yacine Belkadi Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Milosz Tanski Signed-off-by: Linus Torvalds --- kernel/wait.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/wait.c b/kernel/wait.c index ce0daa320a26..dec68bd4e9d8 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -333,7 +333,8 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait(wq, &q->wait, mode); val = q->key.flags; if (atomic_read(val) == 0) - ret = (*action)(val); + break; + ret = (*action)(val); } while (!ret && atomic_read(val) != 0); finish_wait(wq, &q->wait); return ret;
-- cgit v1.2.3
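To close, the corrected control flow is small enough to model in C11 with stdatomic. The prepare/finish wait machinery is omitted, and action() is a hypothetical stand-in that also simulates another thread dropping the counter:

#include <stdatomic.h>
#include <stdio.h>

static int action(atomic_int *val)
{
	printf("counter=%d, sleeping...\n", atomic_load(val));
	atomic_fetch_sub(val, 1);	/* simulate another user finishing */
	return 0;			/* 0 = not interrupted */
}

static int wait_on_atomic(atomic_int *val)
{
	int ret = 0;

	do {
		if (atomic_load(val) == 0)
			break;			/* the fix: done, skip action() */
		ret = action(val);		/* only sleep while users remain */
	} while (!ret && atomic_load(val) != 0);
	return ret;
}

int main(void)
{
	atomic_int pending = 3;

	wait_on_atomic(&pending);
	printf("all users gone (counter=%d)\n", atomic_load(&pending));
	return 0;
}

With the pre-fix ordering, action() (the sleep) would run precisely when the counter had already reached zero, which is the FS-Cache cleanup hang described above.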