From d631097577f6fe027f4903f62eabced6445d19bf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 31 Oct 2011 11:07:42 +0200 Subject: tracing: fix event_subsystem ref counting Fix a bug introduced by e9dbfae5, which prevents event_subsystem from ever being released. Ref_count was added to keep track of subsystem users, not for counting events. Subsystem is created with ref_count = 1, so there is no need to increment it for every event, we have nr_events for that. Fix this by touching ref_count only when we actually have a new user - subsystem_open(). Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Link: http://lkml.kernel.org/r/1320052062-7846-1-git-send-email-idryomov@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 581876f9f387..c212a7f934ec 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { - __get_system(system); system->nr_events++; return system->entry; } -- cgit v1.2.3 From ed0449af5373abd766c79fbf83254bebc996bd23 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 1 Nov 2011 09:09:35 +0800 Subject: tracing: Restore system filter behavior Though not all events have field 'prev_pid', it was allowed to do this: # echo 'prev_pid == 100' > events/sched/filter but commit 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 (tracing/filter: Swap entire filter of events) broke it without any reason. Link: http://lkml.kernel.org/r/4EAF46CF.8040408@cn.fujitsu.com Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 816d3d074979..86040d9c1acc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1649,7 +1649,9 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - goto fail; + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; } list_for_each_entry(call, &ftrace_events, list) { @@ -1658,6 +1660,9 @@ static int replace_system_preds(struct event_subsystem *system, if (strcmp(call->class->system, system->name) != 0) continue; + if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + continue; + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); if (!filter_item) goto fail_mem; -- cgit v1.2.3 From 49aa29513ec995f201664cf6eee36e5326ed38bf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Nov 2011 16:46:46 -0400 Subject: tracing: Add boiler plate for subsystem filter The system filter can be used to set multiple event filters that exist within the system. But currently it displays the last filter written that does not necessarily correspond to the filters within the system. The system filter itself is not used to filter any events. The system filter is just a means to set filters of the events within it. Because this causes an ambiguous state when the system filter reads a filter string but the events within the system have different strings it is best to just show a boiler plate: ### global filter ### # Use this to set filters for multiple events. # Only events with the given fields will be affected. # If no events are modified, an error message will be displayed here. If an error occurs while writing to the system filter, the system filter will replace the boiler plate with the error message as it currently does. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 86040d9c1acc..fdc6d22d406b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -27,6 +27,12 @@ #include "trace.h" #include "trace_output.h" +#define DEFAULT_SYS_FILTER_MESSAGE \ + "### global filter ###\n" \ + "# Use this to set filters for multiple events.\n" \ + "# Only events with the given fields will be affected.\n" \ + "# If no events are modified, an error message will be displayed here" + enum filter_op_ids { OP_OR, @@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); + trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } @@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!filter) goto out; - replace_filter_string(filter, filter_string); + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + /* * No event actually uses the system filter * we can free it without synchronize_sched(). @@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); - if (err) { - append_filter_err(ps, system->filter); - goto out; - } + if (err) + goto err_filter; err = replace_system_preds(system, ps, filter_string); if (err) - append_filter_err(ps, system->filter); + goto err_filter; out: filter_opstack_clear(ps); @@ -1865,6 +1872,11 @@ out_unlock: mutex_unlock(&event_mutex); return err; + +err_filter: + replace_filter_string(filter, filter_string); + append_filter_err(ps, system->filter); + goto out; } #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From 8ee3c92b7f2751c392be2d8fc360a410480b8757 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Nov 2011 10:45:23 -0400 Subject: ftrace: Remove force undef config value left for testing A forced undef of a config value was used for testing and was accidently left in during the final commit. This causes x86 to run slower than needed while running function tracing as well as causes the function graph selftest to fail when DYNMAIC_FTRACE is not set. This is because the code in MCOUNT expects the ftrace code to be processed with the config value set that happened to be forced not set. The forced config option was left in by: commit 6331c28c962561aee59e5a493b7556a4bb585957 ftrace: Fix dynamic selftest failure on some archs Link: http://lkml.kernel.org/r/20111102150255.GA6973@debian Cc: stable@vger.kernel.org Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 077d85387908..0fcc6caead1c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,7 +151,6 @@ void clear_ftrace_function(void) ftrace_pid_function = ftrace_stub; } -#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their -- cgit v1.2.3 From d4d34b981a5327eec956c6cb4cce397ce6f57279 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Nov 2011 20:32:39 -0400 Subject: ftrace: Fix hash record accounting bug If the set_ftrace_filter is cleared by writing just whitespace to it, then the filter hash refcounts will be decremented but not updated. This causes two bugs: 1) No functions will be enabled for tracing when they all should be 2) If the users clears the set_ftrace_filter twice, it will crash ftrace: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/git/linux-trace.git/kernel/trace/ftrace.c:1384 __ftrace_hash_rec_update.part.27+0x157/0x1a7() Modules linked in: Pid: 2330, comm: bash Not tainted 3.1.0-test+ #32 Call Trace: [] warn_slowpath_common+0x83/0x9b [] warn_slowpath_null+0x1a/0x1c [] __ftrace_hash_rec_update.part.27+0x157/0x1a7 [] ? ftrace_regex_release+0xa7/0x10f [] ? kfree+0xe5/0x115 [] ftrace_hash_move+0x2e/0x151 [] ftrace_regex_release+0xba/0x10f [] fput+0xfd/0x1c2 [] filp_close+0x6d/0x78 [] sys_dup3+0x197/0x1c1 [] sys_dup2+0x4f/0x54 [] system_call_fastpath+0x16/0x1b ---[ end trace 77a3a7ee73794a02 ]--- Link: http://lkml.kernel.org/r/20111101141420.GA4918@debian Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fcc6caead1c..7caa4505508d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1210,7 +1210,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, if (!src->count) { free_ftrace_hash_rcu(*dst); rcu_assign_pointer(*dst, EMPTY_HASH); - return 0; + /* still need to update the function records */ + ret = 0; + goto out; } /* -- cgit v1.2.3 From 7e9a49ef542610609144d1afcd516dc3fafac4d6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Nov 2011 16:08:49 +0100 Subject: tracing/latency: Fix header output for latency tracers In case the the graph tracer (CONFIG_FUNCTION_GRAPH_TRACER) or even the function tracer (CONFIG_FUNCTION_TRACER) are not set, the latency tracers do not display proper latency header. The involved/fixed latency tracers are: wakeup_rt wakeup preemptirqsoff preemptoff irqsoff The patch adds proper handling of tracer configuration options for latency tracers, and displaying correct header info accordingly. * The current output (for wakeup tracer) with both graph and function tracers disabled is: # tracer: wakeup # -0 0d.h5 1us+: 0:120:R + [000] 7: 0:R watchdog/0 -0 0d.h5 3us+: ttwu_do_activate.clone.1 <-try_to_wake_up ... * The fixed output is: # tracer: wakeup # # wakeup latency trace v1.1.5 on 3.1.0-tip+ # -------------------------------------------------------------------- # latency: 55 us, #4/4, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: migration/0-6 (uid:0 nice:0 policy:1 rt_prio:99) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / cat-1129 0d..4 1us : 1129:120:R + [000] 6: 0:R migration/0 cat-1129 0d..4 2us+: ttwu_do_activate.clone.1 <-try_to_wake_up * The current output (for wakeup tracer) with only function tracer enabled is: # tracer: wakeup # cat-1140 0d..4 1us+: 1140:120:R + [000] 6: 0:R migration/0 cat-1140 0d..4 2us : ttwu_do_activate.clone.1 <-try_to_wake_up * The fixed output is: # tracer: wakeup # # wakeup latency trace v1.1.5 on 3.1.0-tip+ # -------------------------------------------------------------------- # latency: 207 us, #109/109, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: watchdog/1-12 (uid:0 nice:0 policy:1 rt_prio:99) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / -0 1d.h5 1us+: 0:120:R + [001] 12: 0:R watchdog/1 -0 1d.h5 3us : ttwu_do_activate.clone.1 <-try_to_wake_up Link: http://lkml.kernel.org/r/20111107150849.GE1807@m.brq.redhat.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_irqsoff.c | 13 ++++++++++++- kernel/trace/trace_sched_wakeup.c | 13 ++++++++++++- 4 files changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b24a72d35008..b296186eb93a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2140,6 +2140,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_latency_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_trace_header(m, iter); + + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); +} + void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 092e1f8d18dc..f8ec2291b522 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a1a3359996a7..a248c686b2b8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) } static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } -static void irqsoff_print_header(struct seq_file *s) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void irqsoff_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e4a70c0c71b6..ff791ea48b57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) } static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } -static void wakeup_print_header(struct seq_file *s) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void wakeup_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void wakeup_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* -- cgit v1.2.3 From 77271ce4b2c0df0a76ad1cbb6a95b07e1f88c1ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Nov 2011 09:34:33 -0500 Subject: tracing: Add irq, preempt-count and need resched info to default trace output People keep asking how to get the preempt count, irq, and need resched info and we keep telling them to enable the latency format. Some developers think that traces without this info is completely useless, and for a lot of tasks it is useless. The first option was to enable the latency trace as the default format, but the header for the latency format is pretty useless for most tracers and it also does the timestamp in straight microseconds from the time the trace started. This is sometimes more difficult to read as the default trace is seconds from the start of boot up. Latency format: # tracer: nop # # nop latency trace v1.1.5 on 3.2.0-rc1-test+ # -------------------------------------------------------------------- # latency: 0 us, #159771/64234230, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / migratio-6 0...2 41778231us+: rcu_note_context_switch <-__schedule migratio-6 0...2 41778233us : trace_rcu_utilization <-rcu_note_context_switch migratio-6 0...2 41778235us+: rcu_sched_qs <-rcu_note_context_switch migratio-6 0d..2 41778236us+: rcu_preempt_qs <-rcu_note_context_switch migratio-6 0...2 41778238us : trace_rcu_utilization <-rcu_note_context_switch migratio-6 0...2 41778239us+: debug_lockdep_rcu_enabled <-__schedule default format: # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | migration/0-6 [000] 50.025810: rcu_note_context_switch <-__schedule migration/0-6 [000] 50.025812: trace_rcu_utilization <-rcu_note_context_switch migration/0-6 [000] 50.025813: rcu_sched_qs <-rcu_note_context_switch migration/0-6 [000] 50.025815: rcu_preempt_qs <-rcu_note_context_switch migration/0-6 [000] 50.025817: trace_rcu_utilization <-rcu_note_context_switch migration/0-6 [000] 50.025818: debug_lockdep_rcu_enabled <-__schedule migration/0-6 [000] 50.025820: debug_lockdep_rcu_enabled <-__schedule The latency format header has latency information that is pretty meaningless for most tracers. Although some of the header is useful, and we can add that later to the default format as well. What is really useful with the latency format is the irqs-off, need-resched hard/softirq context and the preempt count. This commit adds the option irq-info which is on by default that adds this information: # tracer: nop # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | -0 [000] d..2 49.309305: cpuidle_get_driver <-cpuidle_idle_call -0 [000] d..2 49.309307: mwait_idle <-cpu_idle -0 [000] d..2 49.309309: need_resched <-mwait_idle -0 [000] d..2 49.309310: test_ti_thread_flag <-need_resched -0 [000] d..2 49.309312: trace_power_start.constprop.13 <-mwait_idle -0 [000] d..2 49.309313: trace_cpu_idle <-mwait_idle -0 [000] d..2 49.309315: need_resched <-mwait_idle If a user wants the old format, they can disable the 'irq-info' option: # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | -0 [000] 49.309305: cpuidle_get_driver <-cpuidle_idle_call -0 [000] 49.309307: mwait_idle <-cpu_idle -0 [000] 49.309309: need_resched <-mwait_idle -0 [000] 49.309310: test_ti_thread_flag <-need_resched -0 [000] 49.309312: trace_power_start.constprop.13 <-mwait_idle -0 [000] 49.309313: trace_cpu_idle <-mwait_idle -0 [000] 49.309315: need_resched <-mwait_idle Requested-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++++++++++++++---- kernel/trace/trace.h | 1 + kernel/trace/trace_output.c | 16 ++++++++++++++-- 3 files changed, 35 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e158cc84ced..fa5cee639962 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | + TRACE_ITER_IRQ_INFO; static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); @@ -426,6 +427,7 @@ static const char *trace_options[] = { "record-cmd", "overwrite", "disable_on_free", + "irq-info", NULL }; @@ -1857,10 +1859,20 @@ static void print_lat_help_header(struct seq_file *m) static void print_func_help_header(struct seq_file *m) { - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } +static void print_func_help_header_irq(struct seq_file *m) +{ + seq_puts(m, "# _-----=> irqs-off\n"); + seq_puts(m, "# / _----=> need-resched\n"); + seq_puts(m, "# | / _---=> hardirq/softirq\n"); + seq_puts(m, "# || / _--=> preempt-depth\n"); + seq_puts(m, "# ||| / delay\n"); + seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | |||| | |\n"); +} void print_trace_header(struct seq_file *m, struct trace_iterator *iter) @@ -2170,8 +2182,12 @@ void trace_default_header(struct seq_file *m) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); + if (!(trace_flags & TRACE_ITER_VERBOSE)) { + if (trace_flags & TRACE_ITER_IRQ_INFO) + print_func_help_header_irq(m); + else + print_func_help_header(m); + } } } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f8ec2291b522..2c2657462ac3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -655,6 +655,7 @@ enum trace_iterator_flags { TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, + TRACE_ITER_IRQ_INFO = 0x800000, }; /* diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 51999309a6cf..0d6ff3555942 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter) unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; char comm[TASK_COMM_LEN]; + int ret; trace_find_cmdline(entry->pid, comm); - return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", - comm, entry->pid, iter->cpu, secs, usec_rem); + ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", + comm, entry->pid, iter->cpu); + if (!ret) + return 0; + + if (trace_flags & TRACE_ITER_IRQ_INFO) { + ret = trace_print_lat_fmt(s, entry); + if (!ret) + return 0; + } + + return trace_seq_printf(s, " %5lu.%06lu: ", + secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) -- cgit v1.2.3 From 39eaf7ef884dcc44f7ff1bac803ca2a1dcf43544 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Nov 2011 10:35:16 -0500 Subject: tracing: Add entries in buffer and total entries to default output header Knowing the number of event entries in the ring buffer compared to the total number that were written is useful information. The latency format gives this information and there's no reason that the default format does not. This information is now added to the default header, along with the number of online CPUs: # tracer: nop # # entries-in-buffer/entries-written: 159836/64690869 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | -0 [000] ...2 49.442971: local_touch_nmi <-cpu_idle -0 [000] d..2 49.442973: enter_idle <-cpu_idle -0 [000] d..2 49.442974: atomic_notifier_call_chain <-enter_idle -0 [000] d..2 49.442976: __atomic_notifier_call_chain <-atomic_notifier The above shows that the trace contains 159836 entries, but 64690869 were written. One could figure out that there were 64531033 entries that were dropped. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 72 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 25 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa5cee639962..7392070ffc39 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1845,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p) trace_event_read_unlock(); } +static void +get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +{ + unsigned long count; + int cpu; + + *total = 0; + *entries = 0; + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + *total += count; + } else + *total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + *entries += count; + } +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1857,14 +1884,27 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_func_help_header(struct seq_file *m) +static void print_event_info(struct trace_array *tr, struct seq_file *m) { + unsigned long total; + unsigned long entries; + + get_total_entries(tr, &total, &entries); + seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", + entries, total, num_online_cpus()); + seq_puts(m, "#\n"); +} + +static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +{ + print_event_info(tr, m); seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void print_func_help_header_irq(struct seq_file *m) +static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) { + print_event_info(tr, m); seq_puts(m, "# _-----=> irqs-off\n"); seq_puts(m, "# / _----=> need-resched\n"); seq_puts(m, "# | / _---=> hardirq/softirq\n"); @@ -1881,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long entries = 0; - unsigned long total = 0; - unsigned long count; + unsigned long entries; + unsigned long total; const char *name = "preemption"; - int cpu; if (type) name = type->name; - - for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; - /* total is the same as the entries */ - total += count; - } else - total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); - entries += count; - } + get_total_entries(tr, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2184,9 +2206,9 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(m); + print_func_help_header_irq(iter->tr, m); else - print_func_help_header(m); + print_func_help_header(iter->tr, m); } } } -- cgit v1.2.3 From d3d9acf646679c1981032b0985b386d12fccc60c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Nov 2011 08:49:49 -0800 Subject: trace_events_filter: Use rcu_assign_pointer() when setting ftrace_event_call->filter ftrace_event_call->filter is sched RCU protected but didn't use rcu_assign_pointer(). Use it. TODO: Add proper __rcu annotation to call->filter and all its users. -v2: Use RCU_INIT_POINTER() for %NULL clearing as suggested by Eric. Link: http://lkml.kernel.org/r/20111123164949.GA29639@google.com Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: stable@kernel.org # (2.6.39+) Signed-off-by: Tejun Heo Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 816d3d074979..d6e7926dcd26 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1686,7 +1686,7 @@ static int replace_system_preds(struct event_subsystem *system, * replace the filter for the call. */ filter = call->filter; - call->filter = filter_item->filter; + rcu_assign_pointer(call->filter, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1741,7 +1741,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) filter = call->filter; if (!filter) goto out_unlock; - call->filter = NULL; + RCU_INIT_POINTER(call->filter, NULL); /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); @@ -1782,7 +1782,7 @@ out: * string */ tmp = call->filter; - call->filter = filter; + rcu_assign_pointer(call->filter, filter); if (tmp) { /* Make sure the call is done with the filter */ synchronize_sched(); -- cgit v1.2.3 From cb59974742aea24adf6637eb0c4b8e7b48bca6fb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 31 Oct 2011 11:07:42 +0200 Subject: tracing: fix event_subsystem ref counting Fix a bug introduced by e9dbfae5, which prevents event_subsystem from ever being released. Ref_count was added to keep track of subsystem users, not for counting events. Subsystem is created with ref_count = 1, so there is no need to increment it for every event, we have nr_events for that. Fix this by touching ref_count only when we actually have a new user - subsystem_open(). Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Link: http://lkml.kernel.org/r/1320052062-7846-1-git-send-email-idryomov@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 581876f9f387..c212a7f934ec 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { - __get_system(system); system->nr_events++; return system->entry; } -- cgit v1.2.3 From 27b14b56af081ec7edeefb3a38b2c9577cc5ef48 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 1 Nov 2011 09:09:35 +0800 Subject: tracing: Restore system filter behavior Though not all events have field 'prev_pid', it was allowed to do this: # echo 'prev_pid == 100' > events/sched/filter but commit 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 (tracing/filter: Swap entire filter of events) broke it without any reason. Link: http://lkml.kernel.org/r/4EAF46CF.8040408@cn.fujitsu.com Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d6e7926dcd26..95dc31efd6dd 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1649,7 +1649,9 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - goto fail; + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; } list_for_each_entry(call, &ftrace_events, list) { @@ -1658,6 +1660,9 @@ static int replace_system_preds(struct event_subsystem *system, if (strcmp(call->class->system, system->name) != 0) continue; + if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + continue; + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); if (!filter_item) goto fail_mem; -- cgit v1.2.3 From c7c6ec8becaf742b223c7b491f4893014be23a07 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Nov 2011 10:45:23 -0400 Subject: ftrace: Remove force undef config value left for testing A forced undef of a config value was used for testing and was accidently left in during the final commit. This causes x86 to run slower than needed while running function tracing as well as causes the function graph selftest to fail when DYNMAIC_FTRACE is not set. This is because the code in MCOUNT expects the ftrace code to be processed with the config value set that happened to be forced not set. The forced config option was left in by: commit 6331c28c962561aee59e5a493b7556a4bb585957 ftrace: Fix dynamic selftest failure on some archs Link: http://lkml.kernel.org/r/20111102150255.GA6973@debian Cc: stable@vger.kernel.org Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900b409543db..1276b3ca1598 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -152,7 +152,6 @@ void clear_ftrace_function(void) ftrace_pid_function = ftrace_stub; } -#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their -- cgit v1.2.3 From ddf6e0e50723b62ac76ed18eb53e9417c6eefba7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Nov 2011 20:32:39 -0400 Subject: ftrace: Fix hash record accounting bug If the set_ftrace_filter is cleared by writing just whitespace to it, then the filter hash refcounts will be decremented but not updated. This causes two bugs: 1) No functions will be enabled for tracing when they all should be 2) If the users clears the set_ftrace_filter twice, it will crash ftrace: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/git/linux-trace.git/kernel/trace/ftrace.c:1384 __ftrace_hash_rec_update.part.27+0x157/0x1a7() Modules linked in: Pid: 2330, comm: bash Not tainted 3.1.0-test+ #32 Call Trace: [] warn_slowpath_common+0x83/0x9b [] warn_slowpath_null+0x1a/0x1c [] __ftrace_hash_rec_update.part.27+0x157/0x1a7 [] ? ftrace_regex_release+0xa7/0x10f [] ? kfree+0xe5/0x115 [] ftrace_hash_move+0x2e/0x151 [] ftrace_regex_release+0xba/0x10f [] fput+0xfd/0x1c2 [] filp_close+0x6d/0x78 [] sys_dup3+0x197/0x1c1 [] sys_dup2+0x4f/0x54 [] system_call_fastpath+0x16/0x1b ---[ end trace 77a3a7ee73794a02 ]--- Link: http://lkml.kernel.org/r/20111101141420.GA4918@debian Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1276b3ca1598..b1e8943fed1d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1211,7 +1211,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, if (!src->count) { free_ftrace_hash_rcu(*dst); rcu_assign_pointer(*dst, EMPTY_HASH); - return 0; + /* still need to update the function records */ + ret = 0; + goto out; } /* -- cgit v1.2.3 From a8eecf2248a45bf69f0625b23c003ad2ccd765ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Oct 2011 11:01:15 -0700 Subject: trace: Allow ftrace_dump() to be called from modules Add an EXPORT_SYMBOL_GPL() so that rcutorture can dump the trace buffer upon detection of an RCU error. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2bd275bb60f..a043d224adf6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4775,6 +4775,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { __ftrace_dump(true, oops_dump_mode); } +EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { -- cgit v1.2.3 From 30fb6aa74011dcf595f306ca2727254d708b786e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 5 Dec 2011 18:22:48 +0100 Subject: ftrace: Fix unregister ftrace_ops accounting Multiple users of the function tracer can register their functions with the ftrace_ops structure. The accounting within ftrace will update the counter on each function record that is being traced. When the ftrace_ops filtering adds or removes functions, the function records will be updated accordingly if the ftrace_ops is still registered. When a ftrace_ops is removed, the counter of the function records, that the ftrace_ops traces, are decremented. When they reach zero the functions that they represent are modified to stop calling the mcount code. When changes are made, the code is updated via stop_machine() with a command passed to the function to tell it what to do. There is an ENABLE and DISABLE command that tells the called function to enable or disable the functions. But the ENABLE is really a misnomer as it should just update the records, as records that have been enabled and now have a count of zero should be disabled. The DISABLE command is used to disable all functions regardless of their counter values. This is the big off switch and is not the complement of the ENABLE command. To make matters worse, when a ftrace_ops is unregistered and there is another ftrace_ops registered, neither the DISABLE nor the ENABLE command are set when calling into the stop_machine() function and the records will not be updated to match their counter. A command is passed to that function that will update the mcount code to call the registered callback directly if it is the only one left. This means that the ftrace_ops that is still registered will have its callback called by all functions that have been set for it as well as the ftrace_ops that was just unregistered. Here's a way to trigger this bug. Compile the kernel with CONFIG_FUNCTION_PROFILER set and with CONFIG_FUNCTION_GRAPH not set: CONFIG_FUNCTION_PROFILER=y # CONFIG_FUNCTION_GRAPH is not set This will force the function profiler to use the function tracer instead of the function graph tracer. # cd /sys/kernel/debug/tracing # echo schedule > set_ftrace_filter # echo function > current_tracer # cat set_ftrace_filter schedule # cat trace # tracer: nop # # entries-in-buffer/entries-written: 692/68108025 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | kworker/0:2-909 [000] .... 531.235574: schedule <-worker_thread -0 [001] .N.. 531.235575: schedule <-cpu_idle kworker/0:2-909 [000] .... 531.235597: schedule <-worker_thread sshd-2563 [001] .... 531.235647: schedule <-schedule_hrtimeout_range_clock # echo 1 > function_profile_enabled # echo 0 > function_porfile_enabled # cat set_ftrace_filter schedule # cat trace # tracer: function # # entries-in-buffer/entries-written: 159701/118821262 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | -0 [002] ...1 604.870655: local_touch_nmi <-cpu_idle -0 [002] d..1 604.870655: enter_idle <-cpu_idle -0 [002] d..1 604.870656: atomic_notifier_call_chain <-enter_idle -0 [002] d..1 604.870656: __atomic_notifier_call_chain <-atomic_notifier_call_chain The same problem could have happened with the trace_probe_ops, but they are modified with the set_frace_filter file which does the update at closure of the file. The simple solution is to change ENABLE to UPDATE and call it every time an ftrace_ops is unregistered. Link: http://lkml.kernel.org/r/1323105776-26961-3-git-send-email-jolsa@redhat.com Cc: stable@vger.kernel.org # 3.0+ Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1e8943fed1d..25b4f4da0fe8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -948,7 +948,7 @@ struct ftrace_func_probe { }; enum { - FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_UPDATE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_START_FUNC_RET = (1 << 3), @@ -1519,7 +1519,7 @@ int ftrace_text_reserved(void *start, void *end) static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int update) { unsigned long ftrace_addr; unsigned long flag = 0UL; @@ -1527,17 +1527,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ftrace_addr = (unsigned long)FTRACE_ADDR; /* - * If we are enabling tracing: + * If we are updating calls: * * If the record has a ref count, then we need to enable it * because someone is using it. * * Otherwise we make sure its disabled. * - * If we are disabling tracing, then disable all records that + * If we are disabling calls, then disable all records that * are enabled. */ - if (enable && (rec->flags & ~FTRACE_FL_MASK)) + if (update && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ @@ -1553,7 +1553,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_nop(NULL, rec, ftrace_addr); } -static void ftrace_replace_code(int enable) +static void ftrace_replace_code(int update) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1567,7 +1567,7 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FREE) continue; - failed = __ftrace_replace_code(rec, enable); + failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1623,7 +1623,7 @@ static int __ftrace_modify_code(void *data) */ function_trace_stop++; - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); @@ -1691,7 +1691,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return -ENODEV; ftrace_start_up++; - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; /* ops marked global share the filter hashes */ if (ops->flags & FTRACE_OPS_FL_GLOBAL) { @@ -1743,8 +1743,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops != &global_ops || !global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; - if (!ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -1766,7 +1765,7 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); } static void ftrace_shutdown_sysctl(void) @@ -2919,7 +2918,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); @@ -3107,7 +3106,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash, iter->hash); if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From c88fd8634ea68e74c7d19fd2621b4078fd22864c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Aug 2011 09:53:39 -0400 Subject: ftrace: Allow archs to modify code without stop machine The stop machine method to modify all functions in the kernel (some 20,000 of them) is the safest way to do so across all archs. But some archs may not need this big hammer approach to modify code on SMP machines, and can simply just update the code it needs. Adding a weak function arch_ftrace_update_code() that now does the stop machine, will also let any arch override this method. If the arch needs to check the system and then decide if it can avoid stop machine, it can still call ftrace_run_stop_machine() to use the old method. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 253 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 215 insertions(+), 38 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 25b4f4da0fe8..655b432fb890 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -947,13 +947,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; -enum { - FTRACE_UPDATE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_START_FUNC_RET = (1 << 3), - FTRACE_STOP_FUNC_RET = (1 << 4), -}; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; @@ -1307,6 +1300,28 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns 1 if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_location(unsigned long ip) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip == ip) + return 1; + } while_for_each_ftrace_rec(); + + return 0; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1475,7 +1490,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip) +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @ip: The address that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying the code. @failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writting to the @ip address + */ +void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -1517,15 +1544,10 @@ int ftrace_text_reserved(void *start, void *end) return 0; } - -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { - unsigned long ftrace_addr; unsigned long flag = 0UL; - ftrace_addr = (unsigned long)FTRACE_ADDR; - /* * If we are updating calls: * @@ -1537,20 +1559,74 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int update) * If we are disabling calls, then disable all records that * are enabled. */ - if (update && (rec->flags & ~FTRACE_FL_MASK)) + if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) - return 0; + return FTRACE_UPDATE_IGNORE; if (flag) { - rec->flags |= FTRACE_FL_ENABLED; + if (update) + rec->flags |= FTRACE_FL_ENABLED; + return FTRACE_UPDATE_MAKE_CALL; + } + + if (update) + rec->flags &= ~FTRACE_FL_ENABLED; + + return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record, set a record that now is tracing or not + * @rec: the record to update + * @enable: set to 1 if the record is tracing, zero to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. + */ +int ftrace_update_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 1); +} + +/** + * ftrace_test_record, check if the record has been enabled or not + * @rec: the record to test + * @enable: set to 1 to check if enabled, 0 if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. + */ +int ftrace_test_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 0); +} + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + ret = ftrace_update_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: return ftrace_make_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + return ftrace_make_nop(NULL, rec, ftrace_addr); } - rec->flags &= ~FTRACE_FL_ENABLED; - return ftrace_make_nop(NULL, rec, ftrace_addr); + return -1; /* unknow ftrace bug */ } static void ftrace_replace_code(int update) @@ -1576,6 +1652,78 @@ static void ftrace_replace_code(int update) } while_for_each_ftrace_rec(); } +struct ftrace_rec_iter { + struct ftrace_page *pg; + int index; +}; + +/** + * ftrace_rec_iter_start, start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. + * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ + /* + * We only use a single iterator. + * Protected by the ftrace_lock mutex. + */ + static struct ftrace_rec_iter ftrace_rec_iter; + struct ftrace_rec_iter *iter = &ftrace_rec_iter; + + iter->pg = ftrace_pages_start; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_next, get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. + */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ + iter->index++; + + if (iter->index >= iter->pg->index) { + iter->pg = iter->pg->next; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + } + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_record, get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ + return &iter->pg->records[iter->index]; +} + static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { @@ -1617,12 +1765,6 @@ static int __ftrace_modify_code(void *data) { int *command = data; - /* - * Do not call function tracer while we update the code. - * We are in stop machine, no worrying about races. - */ - function_trace_stop++; - if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) @@ -1636,21 +1778,33 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif - function_trace_stop--; - return 0; } +/** + * ftrace_run_stop_machine, go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. + */ +void ftrace_run_stop_machine(int command) +{ + stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code, modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ + ftrace_run_stop_machine(command); +} + static void ftrace_run_update_code(int command) { int ret; @@ -1659,8 +1813,31 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; + /* + * Do not call function tracer while we update the code. + * We are in stop machine. + */ + function_trace_stop++; - stop_machine(__ftrace_modify_code, &command, NULL); + /* + * By default we use stop_machine() to modify the code. + * But archs can do what ever they want as long as it + * is safe. The stop_machine() is the safest, but also + * produces the most overhead. + */ + arch_ftrace_update_code(command); + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); -- cgit v1.2.3 From 3208230983a0ee3d95be22d463257e530c684956 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 14:42:37 -0500 Subject: ftrace: Remove usage of "freed" records Records that are added to the function trace table are permanently there, except for modules. By separating out the modules to their own pages that can be freed in one shot we can remove the "freed" flag and simplify some of the record management. Another benefit of doing this is that we can also move the records around; sort them. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 100 +++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 51 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 655b432fb890..be6888f40d2b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -996,8 +996,6 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static struct dyn_ftrace *ftrace_free_records; - static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1421,32 +1419,8 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static void ftrace_free_rec(struct dyn_ftrace *rec) -{ - rec->freelist = ftrace_free_records; - ftrace_free_records = rec; - rec->flags |= FTRACE_FL_FREE; -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - struct dyn_ftrace *rec; - - /* First check for freed records */ - if (ftrace_free_records) { - rec = ftrace_free_records; - - if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - FTRACE_WARN_ON_ONCE(1); - ftrace_free_records = NULL; - return NULL; - } - - ftrace_free_records = rec->freelist; - memset(rec, 0, sizeof(*rec)); - return rec; - } - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) { /* allocate another page */ @@ -1639,10 +1613,6 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - /* Skip over free records */ - if (rec->flags & FTRACE_FL_FREE) - continue; - failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); @@ -2007,11 +1977,8 @@ static int ftrace_update_code(struct module *mod) * Do the initial record conversion from mcount jump * to the NOP instructions. */ - if (!ftrace_code_disable(mod, p)) { - ftrace_free_rec(p); - /* Game over */ + if (!ftrace_code_disable(mod, p)) break; - } ftrace_update_cnt++; @@ -2026,10 +1993,8 @@ static int ftrace_update_code(struct module *mod) */ if (ftrace_start_up && ref) { int failed = __ftrace_replace_code(p, 1); - if (failed) { + if (failed) ftrace_bug(failed, p->ip); - ftrace_free_rec(p); - } } } @@ -2223,9 +2188,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((rec->flags & FTRACE_FL_FREE) || - - ((iter->flags & FTRACE_ITER_FILTER) && + if (((iter->flags & FTRACE_ITER_FILTER) && !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && @@ -2602,7 +2565,6 @@ match_records(struct ftrace_hash *hash, char *buff, goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { ret = enter_record(hash, rec, not); if (ret < 0) { @@ -3446,9 +3408,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FREE) - continue; - if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; @@ -3566,6 +3525,27 @@ static int ftrace_process_locs(struct module *mod, unsigned long flags = 0; /* Shut up gcc */ mutex_lock(&ftrace_lock); + /* + * Core and each module needs their own pages, as + * modules will free them when they are removed. + * Force a new page to be allocated for modules. + */ + if (mod) { + if (!ftrace_pages) + return -ENOMEM; + + /* + * If the last page was full, it will be + * allocated anyway. + */ + if (ftrace_pages->index != ENTRIES_PER_PAGE) { + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages->next) + return -ENOMEM; + ftrace_pages = ftrace_pages->next; + } + } + p = start; while (p < end) { addr = ftrace_call_adjust(*p++); @@ -3599,9 +3579,13 @@ static int ftrace_process_locs(struct module *mod, } #ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; + struct ftrace_page **last_pg; struct ftrace_page *pg; mutex_lock(&ftrace_lock); @@ -3609,16 +3593,30 @@ void ftrace_release_mod(struct module *mod) if (ftrace_disabled) goto out_unlock; - do_for_each_ftrace_rec(pg, rec) { + /* + * Each module has its own ftrace_pages, remove + * them from the list. + */ + last_pg = &ftrace_pages_start; + for (pg = ftrace_pages_start; pg; pg = *last_pg) { + rec = &pg->records[0]; if (within_module_core(rec->ip, mod)) { /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. + * As core pages are first, the first + * page should never be a module page. */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); + if (WARN_ON(pg == ftrace_pages_start)) + goto out_unlock; + + /* Check if we are deleting the last page */ + if (pg == ftrace_pages) + ftrace_pages = next_to_ftrace_page(last_pg); + + *last_pg = pg->next; + free_page((unsigned long)pg); + } else + last_pg = &pg->next; + } out_unlock: mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From a79008755497daff157f5294c02e3b940641cc11 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 16:23:44 -0500 Subject: ftrace: Allocate the mcount record pages as groups Allocate the mcount record pages as a group of pages as big as can be allocated and waste no more than a single page. Grouping the mcount pages as much as possible helps with cache locality, as we do not need to redirect with descriptors as we cross from page to page. It also allows us to do more with the records later on (sort them with bigger benefits). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 179 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 128 insertions(+), 51 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index be6888f40d2b..2e7218869fe9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -983,12 +983,13 @@ static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; + struct dyn_ftrace *records; int index; - struct dyn_ftrace records[]; + int size; }; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) /* estimate from running different kernels */ #define NR_TO_INIT 10000 @@ -1421,14 +1422,10 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) { - /* allocate another page */ - ftrace_pages->next = - (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return NULL; - } + if (ftrace_pages->index == ftrace_pages->size) { + /* We should have allocated enough */ + if (WARN_ON(!ftrace_pages->next)) + return NULL; ftrace_pages = ftrace_pages->next; } @@ -2005,47 +2002,106 @@ static int ftrace_update_code(struct module *mod) return 0; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +static int ftrace_allocate_records(struct ftrace_page *pg, int count) { - struct ftrace_page *pg; + int order; int cnt; - int i; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; + if (WARN_ON(!count)) + return -EINVAL; + + order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. + * We want to fill as much as possible. No more than a page + * may be empty. */ + while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + order--; - pg = ftrace_pages = ftrace_pages_start; + again: + pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); + if (!pg->records) { + /* if we can't allocate this size, try something smaller */ + if (!order) + return -ENOMEM; + order >>= 1; + goto again; + } - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + pg->size = cnt; - /* If we fail, we'll try later anyway */ - if (!pg->next) + if (cnt > count) + cnt = count; + + return cnt; +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + int order; + int cnt; + + if (!num_to_init) + return 0; + + start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + /* + * Try to allocate as much as possible in one continues + * location that fills in all of the space. We want to + * waste as little space as possible. + */ + for (;;) { + cnt = ftrace_allocate_records(pg, num_to_init); + if (cnt < 0) + goto free_pages; + + num_to_init -= cnt; + if (!num_to_init) break; + pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg->next) + goto free_pages; + pg = pg->next; } + return start_pg; + + free_pages: + while (start_pg) { + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + start_pg = pg->next; + kfree(pg); + pg = start_pg; + } + pr_info("ftrace: FAILED to allocate memory for functions\n"); + return NULL; +} + +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +{ + int cnt; + + if (!num_to_init) { + pr_info("ftrace: No functions to be traced?\n"); + return -1; + } + + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld entries in %d pages\n", + num_to_init, cnt + 1); + return 0; } @@ -3520,30 +3576,45 @@ static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg; + unsigned long count; unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + int ret = -ENOMEM; + + count = end - start; + + if (!count) + return 0; + + pg = ftrace_allocate_pages(count); + if (!pg) + return -ENOMEM; mutex_lock(&ftrace_lock); + /* * Core and each module needs their own pages, as * modules will free them when they are removed. * Force a new page to be allocated for modules. */ - if (mod) { + if (!mod) { + WARN_ON(ftrace_pages || ftrace_pages_start); + /* First initialization */ + ftrace_pages = ftrace_pages_start = pg; + } else { if (!ftrace_pages) - return -ENOMEM; + goto out; - /* - * If the last page was full, it will be - * allocated anyway. - */ - if (ftrace_pages->index != ENTRIES_PER_PAGE) { - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return -ENOMEM; - ftrace_pages = ftrace_pages->next; + if (WARN_ON(ftrace_pages->next)) { + /* Hmm, we have free pages? */ + while (ftrace_pages->next) + ftrace_pages = ftrace_pages->next; } + + ftrace_pages->next = pg; + ftrace_pages = pg; } p = start; @@ -3557,7 +3628,8 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - ftrace_record_ip(addr); + if (!ftrace_record_ip(addr)) + break; } /* @@ -3573,9 +3645,11 @@ static int ftrace_process_locs(struct module *mod, ftrace_update_code(mod); if (!mod) local_irq_restore(flags); + ret = 0; + out: mutex_unlock(&ftrace_lock); - return 0; + return ret; } #ifdef CONFIG_MODULES @@ -3587,6 +3661,7 @@ void ftrace_release_mod(struct module *mod) struct dyn_ftrace *rec; struct ftrace_page **last_pg; struct ftrace_page *pg; + int order; mutex_lock(&ftrace_lock); @@ -3613,7 +3688,9 @@ void ftrace_release_mod(struct module *mod) ftrace_pages = next_to_ftrace_page(last_pg); *last_pg = pg->next; - free_page((unsigned long)pg); + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); } else last_pg = &pg->next; } -- cgit v1.2.3 From 85ae32ae019bc1c2cc22e5f51fe0c9f2812ef68c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 16:30:31 -0500 Subject: ftrace: Replace record newlist with record page list As new functions come in to be initalized from mcount to nop, they are done by groups of pages. Whether it is the core kernel or a module. There's no need to keep track of these on a per record basis. At startup, and as any module is loaded, the functions to be traced are stored in a group of pages and added to the function list at the end. We just need to keep a pointer to the first page of the list that was added, and use that to know where to start on the list for initializing functions. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 68 ++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 31 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e7218869fe9..366d7881f188 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -977,8 +977,6 @@ static struct ftrace_ops global_ops = { .filter_hash = EMPTY_HASH, }; -static struct dyn_ftrace *ftrace_new_addrs; - static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { @@ -988,6 +986,8 @@ struct ftrace_page { int size; }; +static struct ftrace_page *ftrace_new_pgs; + #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -1445,8 +1445,6 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->newlist = ftrace_new_addrs; - ftrace_new_addrs = rec; return rec; } @@ -1936,9 +1934,11 @@ static int ops_traces_mod(struct ftrace_ops *ops) static int ftrace_update_code(struct module *mod) { + struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + int i; /* * When adding a module, we need to check if tracers are @@ -1960,41 +1960,44 @@ static int ftrace_update_code(struct module *mod) start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - while (ftrace_new_addrs) { + for (pg = ftrace_new_pgs; pg; pg = pg->next) { - /* If something went wrong, bail without enabling anything */ - if (unlikely(ftrace_disabled)) - return -1; + for (i = 0; i < pg->index; i++) { + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; - p = ftrace_new_addrs; - ftrace_new_addrs = p->newlist; - p->flags = ref; + p = &pg->records[i]; + p->flags = ref; - /* - * Do the initial record conversion from mcount jump - * to the NOP instructions. - */ - if (!ftrace_code_disable(mod, p)) - break; + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) + break; - ftrace_update_cnt++; + ftrace_update_cnt++; - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && ref) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p->ip); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up && ref) { + int failed = __ftrace_replace_code(p, 1); + if (failed) + ftrace_bug(failed, p->ip); + } } } + ftrace_new_pgs = NULL; + stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; @@ -3632,6 +3635,9 @@ static int ftrace_process_locs(struct module *mod, break; } + /* These new locations need to be initialized */ + ftrace_new_pgs = pg; + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt -- cgit v1.2.3 From 68950619f8c82e468d8976130462617abea605a8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 17:06:45 -0500 Subject: ftrace: Sort the mcount records on each page Sort records by ip locations of the ftrace mcount calls on each of the set of pages in the function list. This helps in localizing cache usuage when updating the function locations, as well as gives us the ability to quickly find an ip location in the list. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 366d7881f188..2d6f8bcd1884 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -3575,6 +3576,29 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } +static void ftrace_swap_recs(void *a, void *b, int size) +{ + struct dyn_ftrace *reca = a; + struct dyn_ftrace *recb = b; + struct dyn_ftrace t; + + t = *reca; + *reca = *recb; + *recb = t; +} + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -3638,6 +3662,11 @@ static int ftrace_process_locs(struct module *mod, /* These new locations need to be initialized */ ftrace_new_pgs = pg; + /* Make each individual set of pages sorted by ips */ + for (; pg; pg = pg->next) + sort(pg->records, pg->index, sizeof(struct dyn_ftrace), + ftrace_cmp_recs, ftrace_swap_recs); + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt -- cgit v1.2.3 From 5855fead9cc358adebd6bdeec202d040c623ae38 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 19:27:42 -0500 Subject: ftrace: Use bsearch to find record ip Now that each set of pages in the function list are sorted by ip, we can use bsearch to find a record within each set of pages. This speeds up the ftrace_location() function by magnitudes. For archs (like x86) that need to add a breakpoint at every function that will be converted from a nop to a callback and vice versa, the breakpoint callback needs to know if the breakpoint was for ftrace or not. It requires finding the breakpoint ip within the records. Doing a linear search is extremely inefficient. It is a must to be able to do a fast binary search to find these locations. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d6f8bcd1884..dcd3a814d39b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1300,6 +1301,19 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + /** * ftrace_location - return true if the ip giving is a traced location * @ip: the instruction pointer to check @@ -1313,11 +1327,17 @@ int ftrace_location(unsigned long ip) { struct ftrace_page *pg; struct dyn_ftrace *rec; + struct dyn_ftrace key; - do_for_each_ftrace_rec(pg, rec) { - if (rec->ip == ip) + key.ip = ip; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (rec) return 1; - } while_for_each_ftrace_rec(); + } return 0; } @@ -3587,18 +3607,6 @@ static void ftrace_swap_recs(void *a, void *b, int size) *recb = t; } -static int ftrace_cmp_recs(const void *a, const void *b) -{ - const struct dyn_ftrace *reca = a; - const struct dyn_ftrace *recb = b; - - if (reca->ip > recb->ip) - return 1; - if (reca->ip < recb->ip) - return -1; - return 0; -} - static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) -- cgit v1.2.3 From c842e975520f8ab09e293cc92f51a1f396251fd5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 18:44:44 -0500 Subject: ftrace: Fix ftrace hash record update with notrace When disabling the "notrace" records, that means we want to trace them. If the notrace_hash is zero, it means that we want to trace all records. But to disable a zero notrace_hash means nothing. The check for the notrace_hash count was incorrect with: if (hash && !hash->count) return With the correct comment above it that states that we do nothing if the notrace_hash has zero count. But !hash also means that the notrace hash has zero count. I think this was done to protect against dereferencing NULL. But if !hash is true, then we go through the following loop without doing a single thing. Fix it to: if (!hash || !hash->count) return; Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcd3a814d39b..a383d6c67bfa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1381,7 +1381,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (hash && !hash->count) + if (!hash || !hash->count) return; } -- cgit v1.2.3 From 06a51d9307380c78bb5c92e68fc80ad2c7d7f890 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 19:07:36 -0500 Subject: ftrace: Create ftrace_hash_empty() helper routine There are two types of hashes in the ftrace_ops; one type is the filter_hash and the other is the notrace_hash. Either one may be null, meaning it has no elements. But when elements are added, the hash is allocated. Throughout the code, a check needs to be made to see if a hash exists or the hash has elements, but the check if the hash exists is usually missing causing the possible "NULL pointer dereference bug". Add a helper routine called "ftrace_hash_empty()" that returns true if the hash doesn't exist or its count is zero. As they mean the same thing. Last-bug-reported-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a383d6c67bfa..e1ee07f81ca2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -999,6 +999,11 @@ static struct ftrace_page *ftrace_new_pgs; static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; +static bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1007,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) struct hlist_head *hhd; struct hlist_node *n; - if (!hash->count) + if (ftrace_hash_empty(hash)) return NULL; if (hash->size_bits > 0) @@ -1151,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; /* Empty hash? */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return new_hash; size = 1 << hash->size_bits; @@ -1276,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) filter_hash = rcu_dereference_raw(ops->filter_hash); notrace_hash = rcu_dereference_raw(ops->notrace_hash); - if ((!filter_hash || !filter_hash->count || + if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && - (!notrace_hash || !notrace_hash->count || + (ftrace_hash_empty(notrace_hash) || !ftrace_lookup_ip(notrace_hash, ip))) ret = 1; else @@ -1371,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash) { hash = ops->filter_hash; other_hash = ops->notrace_hash; - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; @@ -1381,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return; } @@ -1398,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) match = 1; } else { - in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* * @@ -1407,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash && in_hash && !in_other_hash) match = 1; else if (!filter_hash && in_hash && - (in_other_hash || !other_hash->count)) + (in_other_hash || ftrace_hash_empty(other_hash))) match = 1; } if (!match) @@ -1950,7 +1955,7 @@ static int ops_traces_mod(struct ftrace_ops *ops) struct ftrace_hash *hash; hash = ops->filter_hash; - return !!(!hash || !hash->count); + return ftrace_hash_empty(hash); } static int ftrace_update_code(struct module *mod) @@ -2320,7 +2325,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { + if (iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->filter_hash)) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; -- cgit v1.2.3 From fc13cb0ce45296f331263a6034aa1814203e1ac3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 14:41:25 -0500 Subject: ftrace: Allow other users of function tracing to use the output listing The function tracer is set up to allow any other subsystem (like perf) to use it. Ftrace already has a way to list what functions are enabled by the global_ops. It would be very helpful to let other users of the function tracer to be able to use the same code. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e1ee07f81ca2..5b105c5ddc0c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2134,14 +2134,6 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) return 0; } -enum { - FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_HASH = (1 << 3), - FTRACE_ITER_ENABLED = (1 << 4), -}; - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -2249,7 +2241,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -2305,7 +2297,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -2414,6 +2406,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2442,6 +2435,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file) iter->pg = ftrace_pages_start; iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2462,7 +2456,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) mutex_unlock(&ftrace_lock); } -static int +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. Depending on @flag it may process the filter hash or + * the notrace hash of @ops. With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * ftrace_regex_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). + */ +int ftrace_regex_open(struct ftrace_ops *ops, int flag, struct inode *inode, struct file *file) { @@ -2542,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static loff_t +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -3095,14 +3105,14 @@ out_unlock: return ret; } -static ssize_t +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return ftrace_regex_write(file, ubuf, cnt, ppos, 1); } -static ssize_t +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3292,8 +3302,7 @@ static void __init set_ftrace_early_filters(void) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } -static int -ftrace_regex_release(struct inode *inode, struct file *file) +int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; -- cgit v1.2.3 From 69a3083c4a7df0322d97bb2b43a33cb12af8131a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 15:21:16 -0500 Subject: ftrace: Decouple hash items from showing filtered functions The set_ftrace_filter shows "hashed" functions, which are functions that are added with operations to them (like traceon and traceoff). As other subsystems may be able to show what functions they are using for function tracing, the hash items should no longer be shown just because the FILTER flag is set. As they have nothing to do with other subsystems filters. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b105c5ddc0c..5728d9aa632e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2198,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; + if (!(iter->flags & FTRACE_ITER_DO_HASH)) + return NULL; + if (iter->func_pos > *pos) return NULL; @@ -2343,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p) { - if (iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); - - return NULL; - } + if (!p) + return t_hash_start(m, pos); return iter; } @@ -2541,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, - inode, file); + return ftrace_regex_open(&global_ops, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + inode, file); } static int -- cgit v1.2.3 From d2d45c7a03a2b1a14159cbb665e9dd60991a7d4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 14:44:09 -0500 Subject: tracing: Have stack_tracer use a separate list of functions The stack_tracer is used to look at every function and check if the current stack is bigger than the last recorded max stack size. When a new max is found, then it saves that stack off. Currently the stack tracer is limited by the global_ops of the function tracer. As the stack tracer has nothing to do with the ftrace function tracer, except that it uses it as its internal engine, the stack tracer should have its own list. A new file is added to the tracing debugfs directory called: stack_trace_filter that can be used to select which functions you want to check the stack on. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 77575b386d97..0398b7c7afd6 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -133,7 +133,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t @@ -311,6 +310,21 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { + .open = stack_trace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_regex_release, +}; + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -358,6 +372,9 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); + trace_create_file("stack_trace_filter", 0444, d_tracer, + NULL, &stack_trace_filter_fops); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3 From 2a85a37f168d2b4d74d493b578af4dc9032be92e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 21:57:44 -0500 Subject: ftrace: Allow access to the boot time function enabling Change set_ftrace_early_filter() to ftrace_set_early_filter() and make it a global function. This will allow other subsystems in the kernel to be able to enable function tracing at start up and reuse the ftrace function parsing code. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5728d9aa632e..683d559a0eef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3279,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init -set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; @@ -3293,9 +3293,9 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); + ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); + ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); -- cgit v1.2.3 From 762e1207889b3451c50d365b741af6f9ce958886 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 22:01:00 -0500 Subject: tracing: Have stack tracing set filtered functions at boot Add stacktrace_filter= to the kernel command line that lets the user pick specific functions to check the stack on. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0398b7c7afd6..d4545f49242e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,9 @@ #include #include #include + +#include + #include "trace.h" #define STACK_TRACE_ENTRIES 500 @@ -352,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; + static __init int enable_stacktrace(char *str) { + if (strncmp(str, "_filter=", 8) == 0) + strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + stack_tracer_enabled = 1; last_stack_tracer_enabled = 1; return 1; @@ -375,6 +383,9 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace_filter", 0444, d_tracer, NULL, &stack_trace_filter_fops); + if (stack_trace_filter_buf[0]) + ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3 From 38b78eb855409a05f9d370228bec1955e6878e08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Dec 2011 14:31:35 -0800 Subject: tracing: Factorize filter creation There are four places where new filter for a given filter string is created, which involves several different steps. This patch factors those steps into create_[system_]filter() functions which in turn make use of create_filter_{start|finish}() for common parts. The only functional change is that if replace_filter_string() is requested and fails, creation fails without any side effect instead of being ignored. Note that system filter is now installed after the processing is complete which makes freeing before and then restoring filter string on error unncessary. -v2: Rebased to resolve conflict with 49aa29513e and updated both create_filter() functions to always set *filterp instead of requiring the caller to clear it to %NULL on entry. Link: http://lkml.kernel.org/r/1323988305-1469-2-git-send-email-tj@kernel.org Signed-off-by: Tejun Heo Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 283 +++++++++++++++++++------------------ 1 file changed, 142 insertions(+), 141 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f04cc3136bd3..24aee7127451 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system, return -ENOMEM; } +static int create_filter_start(char *filter_str, bool set_str, + struct filter_parse_state **psp, + struct event_filter **filterp) +{ + struct event_filter *filter; + struct filter_parse_state *ps = NULL; + int err = 0; + + WARN_ON_ONCE(*psp || *filterp); + + /* allocate everything, and if any fails, free all and fail */ + filter = __alloc_filter(); + if (filter && set_str) + err = replace_filter_string(filter, filter_str); + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + + if (!filter || !ps || err) { + kfree(ps); + __free_filter(filter); + return -ENOMEM; + } + + /* we're committed to creating a new filter */ + *filterp = filter; + *psp = ps; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err && set_str) + append_filter_err(ps, filter); + return err; +} + +static void create_filter_finish(struct filter_parse_state *ps) +{ + if (ps) { + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + } +} + +/** + * create_filter - create a filter for a ftrace_event_call + * @call: ftrace_event_call to create a filter for + * @filter_str: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * + * Creates a filter for @call with @filter_str. If @set_str is %true, + * @filter_str is copied and recorded in the new filter. + * + * On success, returns 0 and *@filterp points to the new filter. On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter. In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. + */ +static int create_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, set_str, &ps, &filter); + if (!err) { + err = replace_preds(call, filter, ps, filter_str, false); + if (err && set_str) + append_filter_err(ps, filter); + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + +/** + * create_system_filter - create a filter for an event_subsystem + * @system: event_subsystem to create a filter for + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. + */ +static int create_system_filter(struct event_subsystem *system, + char *filter_str, struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, true, &ps, &filter); + if (!err) { + err = replace_system_preds(system, ps, filter_str); + if (!err) { + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + } else { + append_filter_err(ps, filter); + } + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; - struct event_filter *tmp; int err = 0; mutex_lock(&event_mutex); @@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) { - kfree(ps); - goto out_unlock; - } - - replace_filter_string(filter, filter_string); - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) { - append_filter_err(ps, filter); - goto out; - } + err = create_filter(call, filter_string, true, &filter); - err = replace_preds(call, filter, ps, filter_string, false); - if (err) { - filter_disable(call); - append_filter_err(ps, filter); - } else - call->flags |= TRACE_EVENT_FL_FILTERED; -out: /* * Always swap the call filter with the new filter * even if there was an error. If there was an error * in the filter, we disable the filter and show the error * string */ - tmp = call->filter; - rcu_assign_pointer(call->filter, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - synchronize_sched(); - __free_filter(tmp); + if (filter) { + struct event_filter *tmp = call->filter; + + if (!err) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + filter_disable(call); + + rcu_assign_pointer(call->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } } - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); out_unlock: mutex_unlock(&event_mutex); @@ -1811,7 +1902,6 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; int err = 0; @@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) - goto out; - - /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; - - /* - * No event actually uses the system filter - * we can free it without synchronize_sched(). - */ - __free_filter(system->filter); - system->filter = filter; - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) - goto err_filter; - - err = replace_system_preds(system, ps, filter_string); - if (err) - goto err_filter; - -out: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); + err = create_system_filter(system, filter_string, &filter); + if (filter) { + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; + } out_unlock: mutex_unlock(&event_mutex); return err; - -err_filter: - replace_filter_string(filter, filter_string); - append_filter_err(ps, system->filter); - goto out; } #ifdef CONFIG_PERF_EVENTS @@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, { int err; struct event_filter *filter; - struct filter_parse_state *ps; struct ftrace_event_call *call; mutex_lock(&event_mutex); @@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_filter(); - if (!filter) { - err = PTR_ERR(filter); - goto out_unlock; - } - - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); + err = create_filter(call, filter_str, false, &filter); if (!err) event->filter = filter; - -free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - -free_filter: - if (err) + else __free_filter(filter); out_unlock: @@ -1954,43 +1991,6 @@ out_unlock: #define CREATE_TRACE_POINTS #include "trace_events_filter_test.h" -static int test_get_filter(char *filter_str, struct ftrace_event_call *call, - struct event_filter **pfilter) -{ - struct event_filter *filter; - struct filter_parse_state *ps; - int err = -ENOMEM; - - filter = __alloc_filter(); - if (!filter) - goto out; - - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); - if (!err) - *pfilter = filter; - - free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - - free_filter: - if (err) - __free_filter(filter); - - out: - return err; -} - #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ { \ .filter = FILTER, \ @@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = test_get_filter(d->filter, &event_ftrace_test_filter, - &filter); + err = create_filter(&event_ftrace_test_filter, d->filter, + false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", d->filter, err); + __free_filter(filter); break; } -- cgit v1.2.3 From f4ae40a6a50a98ac23d4b285f739455e926a473e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jul 2011 04:33:43 -0400 Subject: switch debugfs to umode_t Signed-off-by: Al Viro --- kernel/trace/blktrace.c | 2 +- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 16fc34a0806f..cdea7b56b0c9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry) static struct dentry *blk_create_buf_file_callback(const char *filename, struct dentry *parent, - int mode, + umode_t mode, struct rchan_buf *buf, int *is_global) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2bd275bb60f..660b069a0f99 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4385,7 +4385,7 @@ static const struct file_operations trace_options_core_fops = { }; struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 092e1f8d18dc..0154c0b850de 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -312,7 +312,7 @@ void tracing_reset_current(int cpu); void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); -- cgit v1.2.3