From ae63b31e4d0e2ec09c569306ea46f664508ef717 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 3 May 2012 23:09:03 -0400
Subject: tracing: Separate out trace events from global variables

The trace events for ftrace are all defined via global variables.
The arrays of events and event systems are linked to a global list.
This prevents multiple users of the event system (what to enable and
what not to).

By adding descriptors to represent the event/file relation, as well
as to which trace_array descriptor they are associated with, allows
for more than one set of events to be defined. Once the trace events
files have a link between the trace event and the trace_array they
are associated with, we can create multiple trace_arrays that can
record separate events in separate buffers.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c               |   8 +
 kernel/trace/trace.h               |  39 +-
 kernel/trace/trace_events.c        | 776 +++++++++++++++++++++++++------------
 kernel/trace/trace_events_filter.c |   5 +-
 4 files changed, 580 insertions(+), 248 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f1dade56981..932931897b8d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -189,6 +189,8 @@ unsigned long long ns2usecs(cycle_t nsec)
  */
 static struct trace_array	global_trace;
 
+LIST_HEAD(ftrace_trace_arrays);
+
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
 int filter_current_check_discard(struct ring_buffer *buffer,
@@ -5359,6 +5361,12 @@ __init static int tracer_alloc_buffers(void)
 
 	register_die_notifier(&trace_die_notifier);
 
+	global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
+
+	INIT_LIST_HEAD(&global_trace.systems);
+	INIT_LIST_HEAD(&global_trace.events);
+	list_add(&global_trace.list, &ftrace_trace_arrays);
+
 	while (trace_boot_options) {
 		char *option;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2081971367ea..037f7eb03d69 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -158,13 +158,39 @@ struct trace_array_cpu {
  */
 struct trace_array {
 	struct ring_buffer	*buffer;
+	struct list_head	list;
 	int			cpu;
 	int			buffer_disabled;
+	unsigned int		flags;
 	cycle_t			time_start;
+	struct dentry		*dir;
+	struct dentry		*event_dir;
+	struct list_head	systems;
+	struct list_head	events;
 	struct task_struct	*waiter;
 	struct trace_array_cpu	*data[NR_CPUS];
 };
 
+enum {
+	TRACE_ARRAY_FL_GLOBAL	= (1 << 0)
+};
+
+extern struct list_head ftrace_trace_arrays;
+
+/*
+ * The global tracer (top) should be the first trace array added,
+ * but we check the flag anyway.
+ */
+static inline struct trace_array *top_trace_array(void)
+{
+	struct trace_array *tr;
+
+	tr = list_entry(ftrace_trace_arrays.prev,
+			typeof(*tr), list);
+	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
+	return tr;
+}
+
 #define FTRACE_CMP_TYPE(var, type) \
 	__builtin_types_compatible_p(typeof(var), type *)
 
@@ -851,12 +877,19 @@ struct event_filter {
 struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
-	struct dentry		*entry;
 	struct event_filter	*filter;
-	int			nr_events;
 	int			ref_count;
 };
 
+struct ftrace_subsystem_dir {
+	struct list_head		list;
+	struct event_subsystem		*subsystem;
+	struct trace_array		*tr;
+	struct dentry			*entry;
+	int				ref_count;
+	int				nr_events;
+};
+
 #define FILTER_PRED_INVALID	((unsigned short)-1)
 #define FILTER_PRED_IS_RIGHT	(1 << 15)
 #define FILTER_PRED_FOLD	(1 << 15)
@@ -914,7 +947,7 @@ extern void print_event_filter(struct ftrace_event_call *call,
 			       struct trace_seq *s);
 extern int apply_event_filter(struct ftrace_event_call *call,
 			      char *filter_string);
-extern int apply_subsystem_event_filter(struct event_subsystem *system,
+extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
 					char *filter_string);
 extern void print_subsystem_event_filter(struct event_subsystem *system,
 					 struct trace_seq *s);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 57e9b284250c..439955239bae 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -36,6 +36,19 @@ EXPORT_SYMBOL_GPL(event_storage);
 LIST_HEAD(ftrace_events);
 LIST_HEAD(ftrace_common_fields);
 
+/* Double loops, do not use break, only goto's work */
+#define do_for_each_event_file(tr, file)			\
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\
+		list_for_each_entry(file, &tr->events, list)
+
+#define do_for_each_event_file_safe(tr, file)			\
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\
+		struct ftrace_event_file *___n;				\
+		list_for_each_entry_safe(file, ___n, &tr->events, list)
+
+#define while_for_each_event_file()		\
+	}
+
 struct list_head *
 trace_get_fields(struct ftrace_event_call *event_call)
 {
@@ -149,15 +162,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
 int ftrace_event_reg(struct ftrace_event_call *call,
 		     enum trace_reg type, void *data)
 {
+	struct ftrace_event_file *file = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
 		return tracepoint_probe_register(call->name,
 						 call->class->probe,
-						 call);
+						 file);
 	case TRACE_REG_UNREGISTER:
 		tracepoint_probe_unregister(call->name,
 					    call->class->probe,
-					    call);
+					    file);
 		return 0;
 
 #ifdef CONFIG_PERF_EVENTS
@@ -183,54 +198,57 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);
 
 void trace_event_enable_cmd_record(bool enable)
 {
-	struct ftrace_event_call *call;
+	struct ftrace_event_file *file;
+	struct trace_array *tr;
 
 	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
-		if (!(call->flags & TRACE_EVENT_FL_ENABLED))
+	do_for_each_event_file(tr, file) {
+
+		if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
 			continue;
 
 		if (enable) {
 			tracing_start_cmdline_record();
-			call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+			file->flags |= FTRACE_EVENT_FL_RECORDED_CMD;
 		} else {
 			tracing_stop_cmdline_record();
-			call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+			file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD;
 		}
-	}
+	} while_for_each_event_file();
 	mutex_unlock(&event_mutex);
 }
 
-static int ftrace_event_enable_disable(struct ftrace_event_call *call,
-					int enable)
+static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+				       int enable)
 {
+	struct ftrace_event_call *call = file->event_call;
 	int ret = 0;
 
 	switch (enable) {
 	case 0:
-		if (call->flags & TRACE_EVENT_FL_ENABLED) {
-			call->flags &= ~TRACE_EVENT_FL_ENABLED;
-			if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
+		if (file->flags & FTRACE_EVENT_FL_ENABLED) {
+			file->flags &= ~FTRACE_EVENT_FL_ENABLED;
+			if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
 				tracing_stop_cmdline_record();
-				call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+				file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD;
 			}
-			call->class->reg(call, TRACE_REG_UNREGISTER, NULL);
+			call->class->reg(call, TRACE_REG_UNREGISTER, file);
 		}
 		break;
 	case 1:
-		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
+		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
 			if (trace_flags & TRACE_ITER_RECORD_CMD) {
 				tracing_start_cmdline_record();
-				call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+				file->flags |= FTRACE_EVENT_FL_RECORDED_CMD;
 			}
-			ret = call->class->reg(call, TRACE_REG_REGISTER, NULL);
+			ret = call->class->reg(call, TRACE_REG_REGISTER, file);
 			if (ret) {
 				tracing_stop_cmdline_record();
 				pr_info("event trace: Could not enable event "
 					"%s\n", call->name);
 				break;
 			}
-			call->flags |= TRACE_EVENT_FL_ENABLED;
+			file->flags |= FTRACE_EVENT_FL_ENABLED;
 		}
 		break;
 	}
@@ -238,13 +256,13 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 	return ret;
 }
 
-static void ftrace_clear_events(void)
+static void ftrace_clear_events(struct trace_array *tr)
 {
-	struct ftrace_event_call *call;
+	struct ftrace_event_file *file;
 
 	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
-		ftrace_event_enable_disable(call, 0);
+	list_for_each_entry(file, &tr->events, list) {
+		ftrace_event_enable_disable(file, 0);
 	}
 	mutex_unlock(&event_mutex);
 }
@@ -257,6 +275,8 @@ static void __put_system(struct event_subsystem *system)
 	if (--system->ref_count)
 		return;
 
+	list_del(&system->list);
+
 	if (filter) {
 		kfree(filter->filter_string);
 		kfree(filter);
@@ -271,24 +291,45 @@ static void __get_system(struct event_subsystem *system)
 	system->ref_count++;
 }
 
-static void put_system(struct event_subsystem *system)
+static void __get_system_dir(struct ftrace_subsystem_dir *dir)
+{
+	WARN_ON_ONCE(dir->ref_count == 0);
+	dir->ref_count++;
+	__get_system(dir->subsystem);
+}
+
+static void __put_system_dir(struct ftrace_subsystem_dir *dir)
+{
+	WARN_ON_ONCE(dir->ref_count == 0);
+	/* If the subsystem is about to be freed, the dir must be too */
+	WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1);
+
+	__put_system(dir->subsystem);
+	if (!--dir->ref_count)
+		kfree(dir);
+}
+
+static void put_system(struct ftrace_subsystem_dir *dir)
 {
 	mutex_lock(&event_mutex);
-	__put_system(system);
+	__put_system_dir(dir);
 	mutex_unlock(&event_mutex);
 }
 
 /*
  * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
  */
-static int __ftrace_set_clr_event(const char *match, const char *sub,
-				  const char *event, int set)
+static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
+				  const char *sub, const char *event, int set)
 {
+	struct ftrace_event_file *file;
 	struct ftrace_event_call *call;
 	int ret = -EINVAL;
 
 	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
+	list_for_each_entry(file, &tr->events, list) {
+
+		call = file->event_call;
 
 		if (!call->name || !call->class || !call->class->reg)
 			continue;
@@ -307,7 +348,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
 		if (event && strcmp(event, call->name) != 0)
 			continue;
 
-		ftrace_event_enable_disable(call, set);
+		ftrace_event_enable_disable(file, set);
 
 		ret = 0;
 	}
@@ -316,7 +357,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
 	return ret;
 }
 
-static int ftrace_set_clr_event(char *buf, int set)
+static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
 {
 	char *event = NULL, *sub = NULL, *match;
 
@@ -344,7 +385,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 			event = NULL;
 	}
 
-	return __ftrace_set_clr_event(match, sub, event, set);
+	return __ftrace_set_clr_event(tr, match, sub, event, set);
 }
 
 /**
@@ -361,7 +402,9 @@ static int ftrace_set_clr_event(char *buf, int set)
  */
 int trace_set_clr_event(const char *system, const char *event, int set)
 {
-	return __ftrace_set_clr_event(NULL, system, event, set);
+	struct trace_array *tr = top_trace_array();
+
+	return __ftrace_set_clr_event(tr, NULL, system, event, set);
 }
 EXPORT_SYMBOL_GPL(trace_set_clr_event);
 
@@ -373,6 +416,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
 	struct trace_parser parser;
+	struct seq_file *m = file->private_data;
+	struct trace_array *tr = m->private;
 	ssize_t read, ret;
 
 	if (!cnt)
@@ -395,7 +440,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 
 		parser.buffer[parser.idx] = 0;
 
-		ret = ftrace_set_clr_event(parser.buffer + !set, set);
+		ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
 		if (ret)
 			goto out_put;
 	}
@@ -411,17 +456,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = v;
+	struct ftrace_event_file *file = v;
+	struct ftrace_event_call *call;
+	struct trace_array *tr = m->private;
 
 	(*pos)++;
 
-	list_for_each_entry_continue(call, &ftrace_events, list) {
+	list_for_each_entry_continue(file, &tr->events, list) {
+		call = file->event_call;
 		/*
 		 * The ftrace subsystem is for showing formats only.
 		 * They can not be enabled or disabled via the event files.
 		 */
 		if (call->class && call->class->reg)
-			return call;
+			return file;
 	}
 
 	return NULL;
@@ -429,30 +477,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_event_call *call;
+	struct ftrace_event_file *file;
+	struct trace_array *tr = m->private;
 	loff_t l;
 
 	mutex_lock(&event_mutex);
 
-	call = list_entry(&ftrace_events, struct ftrace_event_call, list);
+	file = list_entry(&tr->events, struct ftrace_event_file, list);
 	for (l = 0; l <= *pos; ) {
-		call = t_next(m, call, &l);
-		if (!call)
+		file = t_next(m, file, &l);
+		if (!file)
 			break;
 	}
-	return call;
+	return file;
 }
 
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = v;
+	struct ftrace_event_file *file = v;
+	struct trace_array *tr = m->private;
 
 	(*pos)++;
 
-	list_for_each_entry_continue(call, &ftrace_events, list) {
-		if (call->flags & TRACE_EVENT_FL_ENABLED)
-			return call;
+	list_for_each_entry_continue(file, &tr->events, list) {
+		if (file->flags & FTRACE_EVENT_FL_ENABLED)
+			return file;
 	}
 
 	return NULL;
@@ -460,23 +510,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_event_call *call;
+	struct ftrace_event_file *file;
+	struct trace_array *tr = m->private;
 	loff_t l;
 
 	mutex_lock(&event_mutex);
 
-	call = list_entry(&ftrace_events, struct ftrace_event_call, list);
+	file = list_entry(&tr->events, struct ftrace_event_file, list);
 	for (l = 0; l <= *pos; ) {
-		call = s_next(m, call, &l);
-		if (!call)
+		file = s_next(m, file, &l);
+		if (!file)
 			break;
 	}
-	return call;
+	return file;
 }
 
 static int t_show(struct seq_file *m, void *v)
 {
-	struct ftrace_event_call *call = v;
+	struct ftrace_event_file *file = v;
+	struct ftrace_event_call *call = file->event_call;
 
 	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
 		seq_printf(m, "%s:", call->class->system);
@@ -494,10 +546,10 @@ static ssize_t
 event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
 {
-	struct ftrace_event_call *call = filp->private_data;
+	struct ftrace_event_file *file = filp->private_data;
 	char *buf;
 
-	if (call->flags & TRACE_EVENT_FL_ENABLED)
+	if (file->flags & FTRACE_EVENT_FL_ENABLED)
 		buf = "1\n";
 	else
 		buf = "0\n";
@@ -509,10 +561,13 @@ static ssize_t
 event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
-	struct ftrace_event_call *call = filp->private_data;
+	struct ftrace_event_file *file = filp->private_data;
 	unsigned long val;
 	int ret;
 
+	if (!file)
+		return -EINVAL;
+
 	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
 	if (ret)
 		return ret;
@@ -525,7 +580,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	case 0:
 	case 1:
 		mutex_lock(&event_mutex);
-		ret = ftrace_event_enable_disable(call, val);
+		ret = ftrace_event_enable_disable(file, val);
 		mutex_unlock(&event_mutex);
 		break;
 
@@ -543,14 +598,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
 	const char set_to_char[4] = { '?', '0', '1', 'X' };
-	struct event_subsystem *system = filp->private_data;
+	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct event_subsystem *system = dir->subsystem;
 	struct ftrace_event_call *call;
+	struct ftrace_event_file *file;
+	struct trace_array *tr = dir->tr;
 	char buf[2];
 	int set = 0;
 	int ret;
 
 	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
+	list_for_each_entry(file, &tr->events, list) {
+		call = file->event_call;
 		if (!call->name || !call->class || !call->class->reg)
 			continue;
 
@@ -562,7 +621,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		 * or if all events or cleared, or if we have
 		 * a mixture.
 		 */
-		set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
+		set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
 
 		/*
 		 * If we have a mixture, no need to look further.
@@ -584,7 +643,8 @@ static ssize_t
 system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		    loff_t *ppos)
 {
-	struct event_subsystem *system = filp->private_data;
+	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct event_subsystem *system = dir->subsystem;
 	const char *name = NULL;
 	unsigned long val;
 	ssize_t ret;
@@ -607,7 +667,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	if (system)
 		name = system->name;
 
-	ret = __ftrace_set_clr_event(NULL, name, NULL, val);
+	ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
 	if (ret)
 		goto out;
 
@@ -845,43 +905,75 @@ static LIST_HEAD(event_subsystems);
 static int subsystem_open(struct inode *inode, struct file *filp)
 {
 	struct event_subsystem *system = NULL;
+	struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
+	struct trace_array *tr;
 	int ret;
 
-	if (!inode->i_private)
-		goto skip_search;
-
 	/* Make sure the system still exists */
 	mutex_lock(&event_mutex);
-	list_for_each_entry(system, &event_subsystems, list) {
-		if (system == inode->i_private) {
-			/* Don't open systems with no events */
-			if (!system->nr_events) {
-				system = NULL;
-				break;
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		list_for_each_entry(dir, &tr->systems, list) {
+			if (dir == inode->i_private) {
+				/* Don't open systems with no events */
+				if (dir->nr_events) {
+					__get_system_dir(dir);
+					system = dir->subsystem;
+				}
+				goto exit_loop;
 			}
-			__get_system(system);
-			break;
 		}
 	}
+ exit_loop:
 	mutex_unlock(&event_mutex);
 
-	if (system != inode->i_private)
+	if (!system)
 		return -ENODEV;
 
- skip_search:
+	/* Some versions of gcc think dir can be uninitialized here */
+	WARN_ON(!dir);
+
 	ret = tracing_open_generic(inode, filp);
-	if (ret < 0 && system)
-		put_system(system);
+	if (ret < 0)
+		put_system(dir);
+
+	return ret;
+}
+
+static int system_tr_open(struct inode *inode, struct file *filp)
+{
+	struct ftrace_subsystem_dir *dir;
+	struct trace_array *tr = inode->i_private;
+	int ret;
+
+	/* Make a temporary dir that has no system but points to tr */
+	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+	if (!dir)
+		return -ENOMEM;
+
+	dir->tr = tr;
+
+	ret = tracing_open_generic(inode, filp);
+	if (ret < 0)
+		kfree(dir);
+
+	filp->private_data = dir;
 
 	return ret;
 }
 
 static int subsystem_release(struct inode *inode, struct file *file)
 {
-	struct event_subsystem *system = inode->i_private;
+	struct ftrace_subsystem_dir *dir = file->private_data;
 
-	if (system)
-		put_system(system);
+	/*
+	 * If dir->subsystem is NULL, then this is a temporary
+	 * descriptor that was made for a trace_array to enable
+	 * all subsystems.
+	 */
+	if (dir->subsystem)
+		put_system(dir);
+	else
+		kfree(dir);
 
 	return 0;
 }
@@ -890,7 +982,8 @@ static ssize_t
 subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 		      loff_t *ppos)
 {
-	struct event_subsystem *system = filp->private_data;
+	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct event_subsystem *system = dir->subsystem;
 	struct trace_seq *s;
 	int r;
 
@@ -915,7 +1008,7 @@ static ssize_t
 subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 {
-	struct event_subsystem *system = filp->private_data;
+	struct ftrace_subsystem_dir *dir = filp->private_data;
 	char *buf;
 	int err;
 
@@ -932,7 +1025,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	}
 	buf[cnt] = '\0';
 
-	err = apply_subsystem_event_filter(system, buf);
+	err = apply_subsystem_event_filter(dir, buf);
 	free_page((unsigned long) buf);
 	if (err < 0)
 		return err;
@@ -1041,30 +1134,35 @@ static const struct file_operations ftrace_system_enable_fops = {
 	.release = subsystem_release,
 };
 
+static const struct file_operations ftrace_tr_enable_fops = {
+	.open = system_tr_open,
+	.read = system_enable_read,
+	.write = system_enable_write,
+	.llseek = default_llseek,
+	.release = subsystem_release,
+};
+
 static const struct file_operations ftrace_show_header_fops = {
 	.open = tracing_open_generic,
 	.read = show_header,
 	.llseek = default_llseek,
 };
 
-static struct dentry *event_trace_events_dir(void)
+static int
+ftrace_event_open(struct inode *inode, struct file *file,
+		  const struct seq_operations *seq_ops)
 {
-	static struct dentry *d_tracer;
-	static struct dentry *d_events;
-
-	if (d_events)
-		return d_events;
-
-	d_tracer = tracing_init_dentry();
-	if (!d_tracer)
-		return NULL;
+	struct seq_file *m;
+	int ret;
 
-	d_events = debugfs_create_dir("events", d_tracer);
-	if (!d_events)
-		pr_warning("Could not create debugfs "
-			   "'events' directory\n");
+	ret = seq_open(file, seq_ops);
+	if (ret < 0)
+		return ret;
+	m = file->private_data;
+	/* copy tr over to seq ops */
+	m->private = inode->i_private;
 
-	return d_events;
+	return ret;
 }
 
 static int
@@ -1072,117 +1170,169 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)
 {
 	const struct seq_operations *seq_ops = &show_event_seq_ops;
 
-	return seq_open(file, seq_ops);
+	return ftrace_event_open(inode, file, seq_ops);
 }
 
 static int
 ftrace_event_set_open(struct inode *inode, struct file *file)
 {
 	const struct seq_operations *seq_ops = &show_set_event_seq_ops;
+	struct trace_array *tr = inode->i_private;
 
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC))
-		ftrace_clear_events();
+		ftrace_clear_events(tr);
 
-	return seq_open(file, seq_ops);
+	return ftrace_event_open(inode, file, seq_ops);
+}
+
+static struct event_subsystem *
+create_new_subsystem(const char *name)
+{
+	struct event_subsystem *system;
+
+	/* need to create new entry */
+	system = kmalloc(sizeof(*system), GFP_KERNEL);
+	if (!system)
+		return NULL;
+
+	system->ref_count = 1;
+	system->name = kstrdup(name, GFP_KERNEL);
+
+	if (!system->name)
+		goto out_free;
+
+	system->filter = NULL;
+
+	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+	if (!system->filter)
+		goto out_free;
+
+	list_add(&system->list, &event_subsystems);
+
+	return system;
+
+ out_free:
+	kfree(system->name);
+	kfree(system);
+	return NULL;
 }
 
 static struct dentry *
-event_subsystem_dir(const char *name, struct dentry *d_events)
+event_subsystem_dir(struct trace_array *tr, const char *name,
+		    struct ftrace_event_file *file, struct dentry *parent)
 {
+	struct ftrace_subsystem_dir *dir;
 	struct event_subsystem *system;
 	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
-	list_for_each_entry(system, &event_subsystems, list) {
+	list_for_each_entry(dir, &tr->systems, list) {
+		system = dir->subsystem;
 		if (strcmp(system->name, name) == 0) {
-			system->nr_events++;
-			return system->entry;
+			dir->nr_events++;
+			file->system = dir;
+			return dir->entry;
 		}
 	}
 
-	/* need to create new entry */
-	system = kmalloc(sizeof(*system), GFP_KERNEL);
-	if (!system) {
-		pr_warning("No memory to create event subsystem %s\n",
-			   name);
-		return d_events;
+	/* Now see if the system itself exists. */
+	list_for_each_entry(system, &event_subsystems, list) {
+		if (strcmp(system->name, name) == 0)
+			break;
 	}
+	/* Reset system variable when not found */
+	if (&system->list == &event_subsystems)
+		system = NULL;
 
-	system->entry = debugfs_create_dir(name, d_events);
-	if (!system->entry) {
-		pr_warning("Could not create event subsystem %s\n",
-			   name);
-		kfree(system);
-		return d_events;
-	}
+	dir = kmalloc(sizeof(*dir), GFP_KERNEL);
+	if (!dir)
+		goto out_fail;
 
-	system->nr_events = 1;
-	system->ref_count = 1;
-	system->name = kstrdup(name, GFP_KERNEL);
-	if (!system->name) {
-		debugfs_remove(system->entry);
-		kfree(system);
-		return d_events;
+	if (!system) {
+		system = create_new_subsystem(name);
+		if (!system)
+			goto out_free;
+	} else
+		__get_system(system);
+
+	dir->entry = debugfs_create_dir(name, parent);
+	if (!dir->entry) {
+		pr_warning("Failed to create system directory %s\n", name);
+		__put_system(system);
+		goto out_free;
 	}
 
-	list_add(&system->list, &event_subsystems);
-
-	system->filter = NULL;
-
-	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
-	if (!system->filter) {
-		pr_warning("Could not allocate filter for subsystem "
-			   "'%s'\n", name);
-		return system->entry;
-	}
+	dir->tr = tr;
+	dir->ref_count = 1;
+	dir->nr_events = 1;
+	dir->subsystem = system;
+	file->system = dir;
 
-	entry = debugfs_create_file("filter", 0644, system->entry, system,
+	entry = debugfs_create_file("filter", 0644, dir->entry, dir,
 				    &ftrace_subsystem_filter_fops);
 	if (!entry) {
 		kfree(system->filter);
 		system->filter = NULL;
-		pr_warning("Could not create debugfs "
-			   "'%s/filter' entry\n", name);
+		pr_warning("Could not create debugfs '%s/filter' entry\n", name);
 	}
 
-	trace_create_file("enable", 0644, system->entry, system,
+	trace_create_file("enable", 0644, dir->entry, dir,
 			  &ftrace_system_enable_fops);
 
-	return system->entry;
+	list_add(&dir->list, &tr->systems);
+
+	return dir->entry;
+
+ out_free:
+	kfree(dir);
+ out_fail:
+	/* Only print this message if failed on memory allocation */
+	if (!dir || !system)
+		pr_warning("No memory to create event subsystem %s\n",
+			   name);
+	return NULL;
 }
 
 static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+event_create_dir(struct dentry *parent,
+		 struct ftrace_event_file *file,
 		 const struct file_operations *id,
 		 const struct file_operations *enable,
 		 const struct file_operations *filter,
 		 const struct file_operations *format)
 {
+	struct ftrace_event_call *call = file->event_call;
+	struct trace_array *tr = file->tr;
 	struct list_head *head;
+	struct dentry *d_events;
 	int ret;
 
 	/*
 	 * If the trace point header did not define TRACE_SYSTEM
 	 * then the system would be called "TRACE_SYSTEM".
 	 */
-	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
-		d_events = event_subsystem_dir(call->class->system, d_events);
-
-	call->dir = debugfs_create_dir(call->name, d_events);
-	if (!call->dir) {
-		pr_warning("Could not create debugfs "
-			   "'%s' directory\n", call->name);
+	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
+		d_events = event_subsystem_dir(tr, call->class->system, file, parent);
+		if (!d_events)
+			return -ENOMEM;
+	} else
+		d_events = parent;
+
+	file->dir = debugfs_create_dir(call->name, d_events);
+	if (!file->dir) {
+		pr_warning("Could not create debugfs '%s' directory\n",
+			   call->name);
 		return -1;
 	}
 
 	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
-		trace_create_file("enable", 0644, call->dir, call,
+		trace_create_file("enable", 0644, file->dir, file,
 				  enable);
 
 #ifdef CONFIG_PERF_EVENTS
 	if (call->event.type && call->class->reg)
-		trace_create_file("id", 0444, call->dir, call,
+		trace_create_file("id", 0444, file->dir, call,
 		 		  id);
 #endif
 
@@ -1196,23 +1346,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 		if (ret < 0) {
 			pr_warning("Could not initialize trace point"
 				   " events/%s\n", call->name);
-			return ret;
+			return -1;
 		}
 	}
-	trace_create_file("filter", 0644, call->dir, call,
+	trace_create_file("filter", 0644, file->dir, call,
 			  filter);
 
-	trace_create_file("format", 0444, call->dir, call,
+	trace_create_file("format", 0444, file->dir, call,
 			  format);
 
 	return 0;
 }
 
+static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+{
+	if (!dir)
+		return;
+
+	if (!--dir->nr_events) {
+		debugfs_remove_recursive(dir->entry);
+		list_del(&dir->list);
+		__put_system_dir(dir);
+	}
+}
+
+static void remove_event_from_tracers(struct ftrace_event_call *call)
+{
+	struct ftrace_event_file *file;
+	struct trace_array *tr;
+
+	do_for_each_event_file_safe(tr, file) {
+
+		if (file->event_call != call)
+			continue;
+
+		list_del(&file->list);
+		debugfs_remove_recursive(file->dir);
+		remove_subsystem(file->system);
+		kfree(file);
+
+		/*
+		 * The do_for_each_event_file_safe() is
+		 * a double loop. After finding the call for this
+		 * trace_array, we use break to jump to the next
+		 * trace_array.
+		 */
+		break;
+	} while_for_each_event_file();
+}
+
 static void event_remove(struct ftrace_event_call *call)
 {
-	ftrace_event_enable_disable(call, 0);
+	struct trace_array *tr;
+	struct ftrace_event_file *file;
+
+	do_for_each_event_file(tr, file) {
+		if (file->event_call != call)
+			continue;
+		ftrace_event_enable_disable(file, 0);
+		/*
+		 * The do_for_each_event_file() is
+		 * a double loop. After finding the call for this
+		 * trace_array, we use break to jump to the next
+		 * trace_array.
+		 */
+		break;
+	} while_for_each_event_file();
+
 	if (call->event.funcs)
 		__unregister_ftrace_event(&call->event);
+	remove_event_from_tracers(call);
 	list_del(&call->list);
 }
 
@@ -1234,61 +1437,58 @@ static int event_init(struct ftrace_event_call *call)
 }
 
 static int
-__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
-		       const struct file_operations *id,
-		       const struct file_operations *enable,
-		       const struct file_operations *filter,
-		       const struct file_operations *format)
+__register_event(struct ftrace_event_call *call, struct module *mod)
 {
-	struct dentry *d_events;
 	int ret;
 
 	ret = event_init(call);
 	if (ret < 0)
 		return ret;
 
-	d_events = event_trace_events_dir();
-	if (!d_events)
-		return -ENOENT;
-
-	ret = event_create_dir(call, d_events, id, enable, filter, format);
-	if (!ret)
-		list_add(&call->list, &ftrace_events);
+	list_add(&call->list, &ftrace_events);
 	call->mod = mod;
 
-	return ret;
+	return 0;
 }
 
+/* Add an event to a trace directory */
+static int
+__trace_add_new_event(struct ftrace_event_call *call,
+		      struct trace_array *tr,
+		      const struct file_operations *id,
+		      const struct file_operations *enable,
+		      const struct file_operations *filter,
+		      const struct file_operations *format)
+{
+	struct ftrace_event_file *file;
+
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	file->event_call = call;
+	file->tr = tr;
+	list_add(&file->list, &tr->events);
+
+	return event_create_dir(tr->event_dir, file, id, enable, filter, format);
+}
+
+struct ftrace_module_file_ops;
+static void __add_event_to_tracers(struct ftrace_event_call *call,
+				   struct ftrace_module_file_ops *file_ops);
+
 /* Add an additional event_call dynamically */
 int trace_add_event_call(struct ftrace_event_call *call)
 {
 	int ret;
 	mutex_lock(&event_mutex);
-	ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
-				     &ftrace_enable_fops,
-				     &ftrace_event_filter_fops,
-				     &ftrace_event_format_fops);
-	mutex_unlock(&event_mutex);
-	return ret;
-}
 
-static void remove_subsystem_dir(const char *name)
-{
-	struct event_subsystem *system;
+	ret = __register_event(call, NULL);
+	if (ret >= 0)
+		__add_event_to_tracers(call, NULL);
 
-	if (strcmp(name, TRACE_SYSTEM) == 0)
-		return;
-
-	list_for_each_entry(system, &event_subsystems, list) {
-		if (strcmp(system->name, name) == 0) {
-			if (!--system->nr_events) {
-				debugfs_remove_recursive(system->entry);
-				list_del(&system->list);
-				__put_system(system);
-			}
-			break;
-		}
-	}
+	mutex_unlock(&event_mutex);
+	return ret;
 }
 
 /*
@@ -1299,8 +1499,6 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
 	event_remove(call);
 	trace_destroy_fields(call);
 	destroy_preds(call);
-	debugfs_remove_recursive(call->dir);
-	remove_subsystem_dir(call->class->system);
 }
 
 /* Remove an event_call */
@@ -1335,6 +1533,17 @@ struct ftrace_module_file_ops {
 	struct file_operations		filter;
 };
 
+static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod)
+{
+	struct ftrace_module_file_ops *file_ops;
+
+	list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+		if (file_ops->mod == mod)
+			return file_ops;
+	}
+	return NULL;
+}
+
 static struct ftrace_module_file_ops *
 trace_create_file_ops(struct module *mod)
 {
@@ -1386,9 +1595,8 @@ static void trace_module_add_events(struct module *mod)
 		return;
 
 	for_each_event(call, start, end) {
-		__trace_add_event_call(*call, mod,
-				       &file_ops->id, &file_ops->enable,
-				       &file_ops->filter, &file_ops->format);
+		__register_event(*call, mod);
+		__add_event_to_tracers(*call, file_ops);
 	}
 }
 
@@ -1444,6 +1652,10 @@ static int trace_module_notify(struct notifier_block *self,
 	return 0;
 }
 #else
+static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod)
+{
+	return NULL;
+}
 static int trace_module_notify(struct notifier_block *self,
 			       unsigned long val, void *data)
 {
@@ -1451,6 +1663,72 @@ static int trace_module_notify(struct notifier_block *self,
 }
 #endif /* CONFIG_MODULES */
 
+/* Create a new event directory structure for a trace directory. */
+static void
+__trace_add_event_dirs(struct trace_array *tr)
+{
+	struct ftrace_module_file_ops *file_ops = NULL;
+	struct ftrace_event_call *call;
+	int ret;
+
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (call->mod) {
+			/*
+			 * Directories for events by modules need to
+			 * keep module ref counts when opened (as we don't
+			 * want the module to disappear when reading one
+			 * of these files). The file_ops keep account of
+			 * the module ref count.
+			 *
+			 * As event_calls are added in groups by module,
+			 * when we find one file_ops, we don't need to search for
+			 * each call in that module, as the rest should be the
+			 * same. Only search for a new one if the last one did
+			 * not match.
+			 */
+			if (!file_ops || call->mod != file_ops->mod)
+				file_ops = find_ftrace_file_ops(call->mod);
+			if (!file_ops)
+				continue; /* Warn? */
+			ret = __trace_add_new_event(call, tr,
+					&file_ops->id, &file_ops->enable,
+					&file_ops->filter, &file_ops->format);
+			if (ret < 0)
+				pr_warning("Could not create directory for event %s\n",
+					   call->name);
+			continue;
+		}
+		ret = __trace_add_new_event(call, tr,
+					    &ftrace_event_id_fops,
+					    &ftrace_enable_fops,
+					    &ftrace_event_filter_fops,
+					    &ftrace_event_format_fops);
+		if (ret < 0)
+			pr_warning("Could not create directory for event %s\n",
+				   call->name);
+	}
+}
+
+static void
+__add_event_to_tracers(struct ftrace_event_call *call,
+		       struct ftrace_module_file_ops *file_ops)
+{
+	struct trace_array *tr;
+
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		if (file_ops)
+			__trace_add_new_event(call, tr,
+					      &file_ops->id, &file_ops->enable,
+					      &file_ops->filter, &file_ops->format);
+		else
+			__trace_add_new_event(call, tr,
+					      &ftrace_event_id_fops,
+					      &ftrace_enable_fops,
+					      &ftrace_event_filter_fops,
+					      &ftrace_event_format_fops);
+	}
+}
+
 static struct notifier_block trace_module_nb = {
 	.notifier_call = trace_module_notify,
 	.priority = 0,
@@ -1471,8 +1749,43 @@ static __init int setup_trace_event(char *str)
 }
 __setup("trace_event=", setup_trace_event);
 
+int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+	struct dentry *d_events;
+	struct dentry *entry;
+
+	entry = debugfs_create_file("set_event", 0644, parent,
+				    tr, &ftrace_set_event_fops);
+	if (!entry) {
+		pr_warning("Could not create debugfs 'set_event' entry\n");
+		return -ENOMEM;
+	}
+
+	d_events = debugfs_create_dir("events", parent);
+	if (!d_events)
+		pr_warning("Could not create debugfs 'events' directory\n");
+
+	/* ring buffer internal formats */
+	trace_create_file("header_page", 0444, d_events,
+			  ring_buffer_print_page_header,
+			  &ftrace_show_header_fops);
+
+	trace_create_file("header_event", 0444, d_events,
+			  ring_buffer_print_entry_header,
+			  &ftrace_show_header_fops);
+
+	trace_create_file("enable", 0644, d_events,
+			  tr, &ftrace_tr_enable_fops);
+
+	tr->event_dir = d_events;
+	__trace_add_event_dirs(tr);
+
+	return 0;
+}
+
 static __init int event_trace_enable(void)
 {
+	struct trace_array *tr = top_trace_array();
 	struct ftrace_event_call **iter, *call;
 	char *buf = bootup_event_buf;
 	char *token;
@@ -1494,7 +1807,7 @@ static __init int event_trace_enable(void)
 		if (!*token)
 			continue;
 
-		ret = ftrace_set_clr_event(token, 1);
+		ret = ftrace_set_clr_event(tr, token, 1);
 		if (ret)
 			pr_warn("Failed to enable trace event: %s\n", token);
 	}
@@ -1506,61 +1819,29 @@ static __init int event_trace_enable(void)
 
 static __init int event_trace_init(void)
 {
-	struct ftrace_event_call *call;
+	struct trace_array *tr;
 	struct dentry *d_tracer;
 	struct dentry *entry;
-	struct dentry *d_events;
 	int ret;
 
+	tr = top_trace_array();
+
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
 		return 0;
 
 	entry = debugfs_create_file("available_events", 0444, d_tracer,
-				    NULL, &ftrace_avail_fops);
+				    tr, &ftrace_avail_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'available_events' entry\n");
 
-	entry = debugfs_create_file("set_event", 0644, d_tracer,
-				    NULL, &ftrace_set_event_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_event' entry\n");
-
-	d_events = event_trace_events_dir();
-	if (!d_events)
-		return 0;
-
-	/* ring buffer internal formats */
-	trace_create_file("header_page", 0444, d_events,
-			  ring_buffer_print_page_header,
-			  &ftrace_show_header_fops);
-
-	trace_create_file("header_event", 0444, d_events,
-			  ring_buffer_print_entry_header,
-			  &ftrace_show_header_fops);
-
-	trace_create_file("enable", 0644, d_events,
-			  NULL, &ftrace_system_enable_fops);
-
 	if (trace_define_common_fields())
 		pr_warning("tracing: Failed to allocate common fields");
 
-	/*
-	 * Early initialization already enabled ftrace event.
-	 * Now it's only necessary to create the event directory.
-	 */
-	list_for_each_entry(call, &ftrace_events, list) {
-
-		ret = event_create_dir(call, d_events,
-				       &ftrace_event_id_fops,
-				       &ftrace_enable_fops,
-				       &ftrace_event_filter_fops,
-				       &ftrace_event_format_fops);
-		if (ret < 0)
-			event_remove(call);
-	}
+	ret = event_trace_add_tracer(d_tracer, tr);
+	if (ret)
+		return ret;
 
 	ret = register_module_notifier(&trace_module_nb);
 	if (ret)
@@ -1627,13 +1908,20 @@ static __init void event_test_stuff(void)
  */
 static __init void event_trace_self_tests(void)
 {
+	struct ftrace_subsystem_dir *dir;
+	struct ftrace_event_file *file;
 	struct ftrace_event_call *call;
 	struct event_subsystem *system;
+	struct trace_array *tr;
 	int ret;
 
+	tr = top_trace_array();
+
 	pr_info("Running tests on trace events:\n");
 
-	list_for_each_entry(call, &ftrace_events, list) {
+	list_for_each_entry(file, &tr->events, list) {
+
+		call = file->event_call;
 
 		/* Only test those that have a probe */
 		if (!call->class || !call->class->probe)
@@ -1657,15 +1945,15 @@ static __init void event_trace_self_tests(void)
 		 * If an event is already enabled, someone is using
 		 * it and the self test should not be on.
 		 */
-		if (call->flags & TRACE_EVENT_FL_ENABLED) {
+		if (file->flags & FTRACE_EVENT_FL_ENABLED) {
 			pr_warning("Enabled event during self test!\n");
 			WARN_ON_ONCE(1);
 			continue;
 		}
 
-		ftrace_event_enable_disable(call, 1);
+		ftrace_event_enable_disable(file, 1);
 		event_test_stuff();
-		ftrace_event_enable_disable(call, 0);
+		ftrace_event_enable_disable(file, 0);
 
 		pr_cont("OK\n");
 	}
@@ -1674,7 +1962,9 @@ static __init void event_trace_self_tests(void)
 
 	pr_info("Running tests on trace event systems:\n");
 
-	list_for_each_entry(system, &event_subsystems, list) {
+	list_for_each_entry(dir, &tr->systems, list) {
+
+		system = dir->subsystem;
 
 		/* the ftrace system is special, skip it */
 		if (strcmp(system->name, "ftrace") == 0)
@@ -1682,7 +1972,7 @@ static __init void event_trace_self_tests(void)
 
 		pr_info("Testing event system %s: ", system->name);
 
-		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
 		if (WARN_ON_ONCE(ret)) {
 			pr_warning("error enabling system %s\n",
 				   system->name);
@@ -1691,7 +1981,7 @@ static __init void event_trace_self_tests(void)
 
 		event_test_stuff();
 
-		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
+		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
 		if (WARN_ON_ONCE(ret)) {
 			pr_warning("error disabling system %s\n",
 				   system->name);
@@ -1706,7 +1996,7 @@ static __init void event_trace_self_tests(void)
 	pr_info("Running tests on all trace events:\n");
 	pr_info("Testing all events: ");
 
-	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error enabling all events\n");
 		return;
@@ -1715,7 +2005,7 @@ static __init void event_trace_self_tests(void)
 	event_test_stuff();
 
 	/* reset sysname */
-	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error disabling all events\n");
 		return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4d..2a22a177ab44 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1907,16 +1907,17 @@ out_unlock:
 	return err;
 }
 
-int apply_subsystem_event_filter(struct event_subsystem *system,
+int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
 				 char *filter_string)
 {
+	struct event_subsystem *system = dir->subsystem;
 	struct event_filter *filter;
 	int err = 0;
 
 	mutex_lock(&event_mutex);
 
 	/* Make sure the system still has events */
-	if (!system->nr_events) {
+	if (!dir->nr_events) {
 		err = -ENODEV;
 		goto out_unlock;
 	}
-- 
cgit v1.2.3


From ae3b5093ad6004b52e2825f3db1ad8200a2724d8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 23 Jan 2013 15:22:59 -0500
Subject: tracing: Use RING_BUFFER_ALL_CPUS for TRACE_PIPE_ALL_CPU

Both RING_BUFFER_ALL_CPUS and TRACE_PIPE_ALL_CPU are defined as
-1 and used to say that all the ring buffers are to be modified
or read (instead of just a single cpu, which would be >= 0).

There's no reason to keep TRACE_PIPE_ALL_CPU as it is also started
to be used for more than what it was created for, and now that
the ring buffer code added a generic RING_BUFFER_ALL_CPUS define,
we can clean up the trace code to use that instead and remove
the TRACE_PIPE_ALL_CPU macro.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c     | 28 ++++++++++++++--------------
 kernel/trace/trace.h     |  2 --
 kernel/trace/trace_kdb.c |  4 ++--
 3 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 932931897b8d..59953aa28845 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -287,13 +287,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
 
 static inline void trace_access_lock(int cpu)
 {
-	if (cpu == TRACE_PIPE_ALL_CPU) {
+	if (cpu == RING_BUFFER_ALL_CPUS) {
 		/* gain it for accessing the whole ring buffer. */
 		down_write(&all_cpu_access_lock);
 	} else {
 		/* gain it for accessing a cpu ring buffer. */
 
-		/* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
+		/* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
 		down_read(&all_cpu_access_lock);
 
 		/* Secondly block other access to this @cpu ring buffer. */
@@ -303,7 +303,7 @@ static inline void trace_access_lock(int cpu)
 
 static inline void trace_access_unlock(int cpu)
 {
-	if (cpu == TRACE_PIPE_ALL_CPU) {
+	if (cpu == RING_BUFFER_ALL_CPUS) {
 		up_write(&all_cpu_access_lock);
 	} else {
 		mutex_unlock(&per_cpu(cpu_access_lock, cpu));
@@ -1823,7 +1823,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
 	 * If we are in a per_cpu trace file, don't bother by iterating over
 	 * all cpu and peek directly.
 	 */
-	if (cpu_file > TRACE_PIPE_ALL_CPU) {
+	if (cpu_file > RING_BUFFER_ALL_CPUS) {
 		if (ring_buffer_empty_cpu(buffer, cpu_file))
 			return NULL;
 		ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
@@ -1983,7 +1983,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		iter->cpu = 0;
 		iter->idx = -1;
 
-		if (cpu_file == TRACE_PIPE_ALL_CPU) {
+		if (cpu_file == RING_BUFFER_ALL_CPUS) {
 			for_each_tracing_cpu(cpu)
 				tracing_iter_reset(iter, cpu);
 		} else
@@ -2291,7 +2291,7 @@ int trace_empty(struct trace_iterator *iter)
 	int cpu;
 
 	/* If we are looking at one CPU buffer, only check that one */
-	if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
+	if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
 		cpu = iter->cpu_file;
 		buf_iter = trace_buffer_iter(iter, cpu);
 		if (buf_iter) {
@@ -2533,7 +2533,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (!iter->snapshot)
 		tracing_stop();
 
-	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
+	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
 				ring_buffer_read_prepare(iter->tr->buffer, cpu);
@@ -2617,7 +2617,7 @@ static int tracing_open(struct inode *inode, struct file *file)
 	    (file->f_flags & O_TRUNC)) {
 		long cpu = (long) inode->i_private;
 
-		if (cpu == TRACE_PIPE_ALL_CPU)
+		if (cpu == RING_BUFFER_ALL_CPUS)
 			tracing_reset_online_cpus(&global_trace);
 		else
 			tracing_reset(&global_trace, cpu);
@@ -5035,7 +5035,7 @@ static __init int tracer_init_debugfs(void)
 			NULL, &tracing_cpumask_fops);
 
 	trace_create_file("trace", 0644, d_tracer,
-			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+			(void *) RING_BUFFER_ALL_CPUS, &tracing_fops);
 
 	trace_create_file("available_tracers", 0444, d_tracer,
 			&global_trace, &show_traces_fops);
@@ -5055,7 +5055,7 @@ static __init int tracer_init_debugfs(void)
 			NULL, &tracing_readme_fops);
 
 	trace_create_file("trace_pipe", 0444, d_tracer,
-			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
+			(void *) RING_BUFFER_ALL_CPUS, &tracing_pipe_fops);
 
 	trace_create_file("buffer_size_kb", 0644, d_tracer,
 			(void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
@@ -5085,7 +5085,7 @@ static __init int tracer_init_debugfs(void)
 
 #ifdef CONFIG_TRACER_SNAPSHOT
 	trace_create_file("snapshot", 0644, d_tracer,
-			  (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
+			  (void *) RING_BUFFER_ALL_CPUS, &snapshot_fops);
 #endif
 
 	create_trace_options_dir();
@@ -5162,7 +5162,7 @@ void trace_init_global_iter(struct trace_iterator *iter)
 {
 	iter->tr = &global_trace;
 	iter->trace = current_trace;
-	iter->cpu_file = TRACE_PIPE_ALL_CPU;
+	iter->cpu_file = RING_BUFFER_ALL_CPUS;
 }
 
 static void
@@ -5210,7 +5210,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 
 	switch (oops_dump_mode) {
 	case DUMP_ALL:
-		iter.cpu_file = TRACE_PIPE_ALL_CPU;
+		iter.cpu_file = RING_BUFFER_ALL_CPUS;
 		break;
 	case DUMP_ORIG:
 		iter.cpu_file = raw_smp_processor_id();
@@ -5219,7 +5219,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 		goto out_enable;
 	default:
 		printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
-		iter.cpu_file = TRACE_PIPE_ALL_CPU;
+		iter.cpu_file = RING_BUFFER_ALL_CPUS;
 	}
 
 	printk(KERN_TRACE "Dumping ftrace buffer:\n");
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 037f7eb03d69..da09a037abcd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -453,8 +453,6 @@ static __always_inline void trace_clear_recursion(int bit)
 	current->trace_recursion = val;
 }
 
-#define TRACE_PIPE_ALL_CPU	-1
-
 static inline struct ring_buffer_iter *
 trace_buffer_iter(struct trace_iterator *iter, int cpu)
 {
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3c5c5dfea0b3..cc1dbdc5ee5d 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,7 +43,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
 	iter.iter_flags |= TRACE_FILE_LAT_FMT;
 	iter.pos = -1;
 
-	if (cpu_file == TRACE_PIPE_ALL_CPU) {
+	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
 			ring_buffer_read_prepare(iter.tr->buffer, cpu);
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
 		    !cpu_online(cpu_file))
 			return KDB_BADINT;
 	} else {
-		cpu_file = TRACE_PIPE_ALL_CPU;
+		cpu_file = RING_BUFFER_ALL_CPUS;
 	}
 
 	kdb_trap_printk++;
-- 
cgit v1.2.3


From 2b6080f28c7cc3efc8625ab71495aae89aeb63a0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 May 2012 13:29:49 -0400
Subject: tracing: Encapsulate global_trace and remove dependencies on global
 vars

The global_trace variable in kernel/trace/trace.c has been kept 'static' and
local to that file so that it would not be used too much outside of that
file. This has paid off, even though there were lots of changes to make
the trace_array structure more generic (not depending on global_trace).

Removal of a lot of direct usages of global_trace is needed to be able to
create more trace_arrays such that we can add multiple buffers.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c              | 561 ++++++++++++++++++++++----------------
 kernel/trace/trace.h              |  21 +-
 kernel/trace/trace_irqsoff.c      |   8 +-
 kernel/trace/trace_sched_wakeup.c |   8 +-
 4 files changed, 358 insertions(+), 240 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 59953aa28845..91fe40905828 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,7 +1,7 @@
 /*
  * ring buffer based function tracer
  *
- * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
  * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
  *
  * Originally taken from the RT patch by:
@@ -251,9 +251,6 @@ static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
 /* trace_types holds a link list of available tracers. */
 static struct tracer		*trace_types __read_mostly;
 
-/* current_trace points to the tracer that is currently active */
-static struct tracer		*current_trace __read_mostly = &nop_trace;
-
 /*
  * trace_types_lock is used to protect the trace_types list.
  */
@@ -350,9 +347,6 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
 	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
 
-static int trace_stop_count;
-static DEFINE_RAW_SPINLOCK(tracing_start_lock);
-
 /**
  * trace_wake_up - wake up tasks waiting for trace input
  *
@@ -708,14 +702,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct ring_buffer *buf;
 
-	if (trace_stop_count)
+	if (tr->stop_count)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
 
-	if (!current_trace->allocated_snapshot) {
+	if (!tr->current_trace->allocated_snapshot) {
 		/* Only the nop tracer should hit this when disabling */
-		WARN_ON_ONCE(current_trace != &nop_trace);
+		WARN_ON_ONCE(tr->current_trace != &nop_trace);
 		return;
 	}
 
@@ -742,11 +736,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	int ret;
 
-	if (trace_stop_count)
+	if (tr->stop_count)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
+	if (WARN_ON_ONCE(!tr->current_trace->allocated_snapshot))
 		return;
 
 	arch_spin_lock(&ftrace_max_lock);
@@ -853,8 +847,8 @@ int register_tracer(struct tracer *type)
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	if (type->selftest && !tracing_selftest_disabled) {
-		struct tracer *saved_tracer = current_trace;
 		struct trace_array *tr = &global_trace;
+		struct tracer *saved_tracer = tr->current_trace;
 
 		/*
 		 * Run a selftest on this tracer.
@@ -865,7 +859,7 @@ int register_tracer(struct tracer *type)
 		 */
 		tracing_reset_online_cpus(tr);
 
-		current_trace = type;
+		tr->current_trace = type;
 
 		if (type->use_max_tr) {
 			/* If we expanded the buffers, make sure the max is expanded too */
@@ -879,7 +873,7 @@ int register_tracer(struct tracer *type)
 		pr_info("Testing tracer %s: ", type->name);
 		ret = type->selftest(type, tr);
 		/* the test is responsible for resetting too */
-		current_trace = saved_tracer;
+		tr->current_trace = saved_tracer;
 		if (ret) {
 			printk(KERN_CONT "FAILED!\n");
 			/* Add the warning after printing 'FAILED' */
@@ -997,7 +991,7 @@ static void trace_init_cmdlines(void)
 
 int is_tracing_stopped(void)
 {
-	return trace_stop_count;
+	return global_trace.stop_count;
 }
 
 /**
@@ -1029,12 +1023,12 @@ void tracing_start(void)
 	if (tracing_disabled)
 		return;
 
-	raw_spin_lock_irqsave(&tracing_start_lock, flags);
-	if (--trace_stop_count) {
-		if (trace_stop_count < 0) {
+	raw_spin_lock_irqsave(&global_trace.start_lock, flags);
+	if (--global_trace.stop_count) {
+		if (global_trace.stop_count < 0) {
 			/* Someone screwed up their debugging */
 			WARN_ON_ONCE(1);
-			trace_stop_count = 0;
+			global_trace.stop_count = 0;
 		}
 		goto out;
 	}
@@ -1054,7 +1048,38 @@ void tracing_start(void)
 
 	ftrace_start();
  out:
-	raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
+	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+}
+
+static void tracing_start_tr(struct trace_array *tr)
+{
+	struct ring_buffer *buffer;
+	unsigned long flags;
+
+	if (tracing_disabled)
+		return;
+
+	/* If global, we need to also start the max tracer */
+	if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+		return tracing_start();
+
+	raw_spin_lock_irqsave(&tr->start_lock, flags);
+
+	if (--tr->stop_count) {
+		if (tr->stop_count < 0) {
+			/* Someone screwed up their debugging */
+			WARN_ON_ONCE(1);
+			tr->stop_count = 0;
+		}
+		goto out;
+	}
+
+	buffer = tr->buffer;
+	if (buffer)
+		ring_buffer_record_enable(buffer);
+
+ out:
+	raw_spin_unlock_irqrestore(&tr->start_lock, flags);
 }
 
 /**
@@ -1069,8 +1094,8 @@ void tracing_stop(void)
 	unsigned long flags;
 
 	ftrace_stop();
-	raw_spin_lock_irqsave(&tracing_start_lock, flags);
-	if (trace_stop_count++)
+	raw_spin_lock_irqsave(&global_trace.start_lock, flags);
+	if (global_trace.stop_count++)
 		goto out;
 
 	/* Prevent the buffers from switching */
@@ -1087,7 +1112,28 @@ void tracing_stop(void)
 	arch_spin_unlock(&ftrace_max_lock);
 
  out:
-	raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
+	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+}
+
+static void tracing_stop_tr(struct trace_array *tr)
+{
+	struct ring_buffer *buffer;
+	unsigned long flags;
+
+	/* If global, we need to also stop the max tracer */
+	if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+		return tracing_stop();
+
+	raw_spin_lock_irqsave(&tr->start_lock, flags);
+	if (tr->stop_count++)
+		goto out;
+
+	buffer = tr->buffer;
+	if (buffer)
+		ring_buffer_record_disable(buffer);
+
+ out:
+	raw_spin_unlock_irqrestore(&tr->start_lock, flags);
 }
 
 void trace_stop_cmdline_recording(void);
@@ -1956,6 +2002,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
 	struct trace_iterator *iter = m->private;
+	struct trace_array *tr = iter->tr;
 	int cpu_file = iter->cpu_file;
 	void *p = NULL;
 	loff_t l = 0;
@@ -1968,8 +2015,8 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 	 * will point to the same string as current_trace->name.
 	 */
 	mutex_lock(&trace_types_lock);
-	if (unlikely(current_trace && iter->trace->name != current_trace->name))
-		*iter->trace = *current_trace;
+	if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
+		*iter->trace = *tr->current_trace;
 	mutex_unlock(&trace_types_lock);
 
 	if (iter->snapshot && iter->trace->use_max_tr)
@@ -2099,7 +2146,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_array *tr = iter->tr;
 	struct trace_array_cpu *data = tr->data[tr->cpu];
-	struct tracer *type = current_trace;
+	struct tracer *type = iter->trace;
 	unsigned long entries;
 	unsigned long total;
 	const char *name = "preemption";
@@ -2478,7 +2525,8 @@ static const struct seq_operations tracer_seq_ops = {
 static struct trace_iterator *
 __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 {
-	long cpu_file = (long) inode->i_private;
+	struct trace_cpu *tc = inode->i_private;
+	struct trace_array *tr = tc->tr;
 	struct trace_iterator *iter;
 	int cpu;
 
@@ -2503,19 +2551,20 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (!iter->trace)
 		goto fail;
 
-	*iter->trace = *current_trace;
+	*iter->trace = *tr->current_trace;
 
 	if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
 		goto fail;
 
-	if (current_trace->print_max || snapshot)
+	/* Currently only the top directory has a snapshot */
+	if (tr->current_trace->print_max || snapshot)
 		iter->tr = &max_tr;
 	else
-		iter->tr = &global_trace;
+		iter->tr = tr;
 	iter->snapshot = snapshot;
 	iter->pos = -1;
 	mutex_init(&iter->mutex);
-	iter->cpu_file = cpu_file;
+	iter->cpu_file = tc->cpu;
 
 	/* Notify the tracer early; before we stop tracing. */
 	if (iter->trace && iter->trace->open)
@@ -2531,7 +2580,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 
 	/* stop the trace while dumping if we are not opening "snapshot" */
 	if (!iter->snapshot)
-		tracing_stop();
+		tracing_stop_tr(tr);
 
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
@@ -2578,6 +2627,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *m = file->private_data;
 	struct trace_iterator *iter;
+	struct trace_array *tr;
 	int cpu;
 
 	if (!(file->f_mode & FMODE_READ))
@@ -2585,6 +2635,12 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 	iter = m->private;
 
+	/* Only the global tracer has a matching max_tr */
+	if (iter->tr == &max_tr)
+		tr = &global_trace;
+	else
+		tr = iter->tr;
+
 	mutex_lock(&trace_types_lock);
 	for_each_tracing_cpu(cpu) {
 		if (iter->buffer_iter[cpu])
@@ -2596,7 +2652,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 	if (!iter->snapshot)
 		/* reenable tracing if it was previously enabled */
-		tracing_start();
+		tracing_start_tr(tr);
 	mutex_unlock(&trace_types_lock);
 
 	mutex_destroy(&iter->mutex);
@@ -2615,12 +2671,13 @@ static int tracing_open(struct inode *inode, struct file *file)
 	/* If this file was open for write, then erase contents */
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC)) {
-		long cpu = (long) inode->i_private;
+		struct trace_cpu *tc = inode->i_private;
+		struct trace_array *tr = tc->tr;
 
-		if (cpu == RING_BUFFER_ALL_CPUS)
-			tracing_reset_online_cpus(&global_trace);
+		if (tc->cpu == RING_BUFFER_ALL_CPUS)
+			tracing_reset_online_cpus(tr);
 		else
-			tracing_reset(&global_trace, cpu);
+			tracing_reset(tr, tc->cpu);
 	}
 
 	if (file->f_mode & FMODE_READ) {
@@ -2767,8 +2824,9 @@ static ssize_t
 tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 		      size_t count, loff_t *ppos)
 {
-	int err, cpu;
+	struct trace_array *tr = filp->private_data;
 	cpumask_var_t tracing_cpumask_new;
+	int err, cpu;
 
 	if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
 		return -ENOMEM;
@@ -2788,13 +2846,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 		 */
 		if (cpumask_test_cpu(cpu, tracing_cpumask) &&
 				!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
-			atomic_inc(&global_trace.data[cpu]->disabled);
-			ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
+			atomic_inc(&tr->data[cpu]->disabled);
+			ring_buffer_record_disable_cpu(tr->buffer, cpu);
 		}
 		if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
 				cpumask_test_cpu(cpu, tracing_cpumask_new)) {
-			atomic_dec(&global_trace.data[cpu]->disabled);
-			ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
+			atomic_dec(&tr->data[cpu]->disabled);
+			ring_buffer_record_enable_cpu(tr->buffer, cpu);
 		}
 	}
 	arch_spin_unlock(&ftrace_max_lock);
@@ -2823,12 +2881,13 @@ static const struct file_operations tracing_cpumask_fops = {
 static int tracing_trace_options_show(struct seq_file *m, void *v)
 {
 	struct tracer_opt *trace_opts;
+	struct trace_array *tr = m->private;
 	u32 tracer_flags;
 	int i;
 
 	mutex_lock(&trace_types_lock);
-	tracer_flags = current_trace->flags->val;
-	trace_opts = current_trace->flags->opts;
+	tracer_flags = tr->current_trace->flags->val;
+	trace_opts = tr->current_trace->flags->opts;
 
 	for (i = 0; trace_options[i]; i++) {
 		if (trace_flags & (1 << i))
@@ -2892,15 +2951,15 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
 	return 0;
 }
 
-int set_tracer_flag(unsigned int mask, int enabled)
+int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 {
 	/* do nothing if flag is already set */
 	if (!!(trace_flags & mask) == !!enabled)
 		return 0;
 
 	/* Give the tracer a chance to approve the change */
-	if (current_trace->flag_changed)
-		if (current_trace->flag_changed(current_trace, mask, !!enabled))
+	if (tr->current_trace->flag_changed)
+		if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))
 			return -EINVAL;
 
 	if (enabled)
@@ -2924,7 +2983,7 @@ int set_tracer_flag(unsigned int mask, int enabled)
 	return 0;
 }
 
-static int trace_set_options(char *option)
+static int trace_set_options(struct trace_array *tr, char *option)
 {
 	char *cmp;
 	int neg = 0;
@@ -2942,14 +3001,14 @@ static int trace_set_options(char *option)
 
 	for (i = 0; trace_options[i]; i++) {
 		if (strcmp(cmp, trace_options[i]) == 0) {
-			ret = set_tracer_flag(1 << i, !neg);
+			ret = set_tracer_flag(tr, 1 << i, !neg);
 			break;
 		}
 	}
 
 	/* If no option could be set, test the specific tracer options */
 	if (!trace_options[i])
-		ret = set_tracer_option(current_trace, cmp, neg);
+		ret = set_tracer_option(tr->current_trace, cmp, neg);
 
 	mutex_unlock(&trace_types_lock);
 
@@ -2960,6 +3019,8 @@ static ssize_t
 tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 			size_t cnt, loff_t *ppos)
 {
+	struct seq_file *m = filp->private_data;
+	struct trace_array *tr = m->private;
 	char buf[64];
 	int ret;
 
@@ -2971,7 +3032,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 
 	buf[cnt] = 0;
 
-	ret = trace_set_options(buf);
+	ret = trace_set_options(tr, buf);
 	if (ret < 0)
 		return ret;
 
@@ -2984,7 +3045,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
 {
 	if (tracing_disabled)
 		return -ENODEV;
-	return single_open(file, tracing_trace_options_show, NULL);
+
+	return single_open(file, tracing_trace_options_show, inode->i_private);
 }
 
 static const struct file_operations tracing_iter_fops = {
@@ -3082,11 +3144,12 @@ static ssize_t
 tracing_set_trace_read(struct file *filp, char __user *ubuf,
 		       size_t cnt, loff_t *ppos)
 {
+	struct trace_array *tr = filp->private_data;
 	char buf[MAX_TRACER_SIZE+2];
 	int r;
 
 	mutex_lock(&trace_types_lock);
-	r = sprintf(buf, "%s\n", current_trace->name);
+	r = sprintf(buf, "%s\n", tr->current_trace->name);
 	mutex_unlock(&trace_types_lock);
 
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3130,7 +3193,8 @@ static int resize_buffer_duplicate_size(struct trace_array *tr,
 	return ret;
 }
 
-static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
+static int __tracing_resize_ring_buffer(struct trace_array *tr,
+					unsigned long size, int cpu)
 {
 	int ret;
 
@@ -3142,20 +3206,20 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
 	ring_buffer_expanded = 1;
 
 	/* May be called before buffers are initialized */
-	if (!global_trace.buffer)
+	if (!tr->buffer)
 		return 0;
 
-	ret = ring_buffer_resize(global_trace.buffer, size, cpu);
+	ret = ring_buffer_resize(tr->buffer, size, cpu);
 	if (ret < 0)
 		return ret;
 
-	if (!current_trace->use_max_tr)
+	if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
+	    !tr->current_trace->use_max_tr)
 		goto out;
 
 	ret = ring_buffer_resize(max_tr.buffer, size, cpu);
 	if (ret < 0) {
-		int r = resize_buffer_duplicate_size(&global_trace,
-						     &global_trace, cpu);
+		int r = resize_buffer_duplicate_size(tr, tr, cpu);
 		if (r < 0) {
 			/*
 			 * AARGH! We are left with different
@@ -3184,14 +3248,15 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
 
  out:
 	if (cpu == RING_BUFFER_ALL_CPUS)
-		set_buffer_entries(&global_trace, size);
+		set_buffer_entries(tr, size);
 	else
-		global_trace.data[cpu]->entries = size;
+		tr->data[cpu]->entries = size;
 
 	return ret;
 }
 
-static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
+static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
+					  unsigned long size, int cpu_id)
 {
 	int ret = size;
 
@@ -3205,7 +3270,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
 		}
 	}
 
-	ret = __tracing_resize_ring_buffer(size, cpu_id);
+	ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
 	if (ret < 0)
 		ret = -ENOMEM;
 
@@ -3232,7 +3297,7 @@ int tracing_update_buffers(void)
 
 	mutex_lock(&trace_types_lock);
 	if (!ring_buffer_expanded)
-		ret = __tracing_resize_ring_buffer(trace_buf_size,
+		ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
 						RING_BUFFER_ALL_CPUS);
 	mutex_unlock(&trace_types_lock);
 
@@ -3242,7 +3307,7 @@ int tracing_update_buffers(void)
 struct trace_option_dentry;
 
 static struct trace_option_dentry *
-create_trace_option_files(struct tracer *tracer);
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
 
 static void
 destroy_trace_option_files(struct trace_option_dentry *topts);
@@ -3258,7 +3323,7 @@ static int tracing_set_tracer(const char *buf)
 	mutex_lock(&trace_types_lock);
 
 	if (!ring_buffer_expanded) {
-		ret = __tracing_resize_ring_buffer(trace_buf_size,
+		ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
 						RING_BUFFER_ALL_CPUS);
 		if (ret < 0)
 			goto out;
@@ -3273,18 +3338,18 @@ static int tracing_set_tracer(const char *buf)
 		ret = -EINVAL;
 		goto out;
 	}
-	if (t == current_trace)
+	if (t == tr->current_trace)
 		goto out;
 
 	trace_branch_disable();
 
-	current_trace->enabled = false;
+	tr->current_trace->enabled = false;
 
-	if (current_trace->reset)
-		current_trace->reset(tr);
+	if (tr->current_trace->reset)
+		tr->current_trace->reset(tr);
 
-	had_max_tr = current_trace->allocated_snapshot;
-	current_trace = &nop_trace;
+	had_max_tr = tr->current_trace->allocated_snapshot;
+	tr->current_trace = &nop_trace;
 
 	if (had_max_tr && !t->use_max_tr) {
 		/*
@@ -3303,11 +3368,11 @@ static int tracing_set_tracer(const char *buf)
 		ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
 		set_buffer_entries(&max_tr, 1);
 		tracing_reset_online_cpus(&max_tr);
-		current_trace->allocated_snapshot = false;
+		tr->current_trace->allocated_snapshot = false;
 	}
 	destroy_trace_option_files(topts);
 
-	topts = create_trace_option_files(t);
+	topts = create_trace_option_files(tr, t);
 	if (t->use_max_tr && !had_max_tr) {
 		/* we need to make per cpu buffer sizes equivalent */
 		ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
@@ -3323,8 +3388,8 @@ static int tracing_set_tracer(const char *buf)
 			goto out;
 	}
 
-	current_trace = t;
-	current_trace->enabled = true;
+	tr->current_trace = t;
+	tr->current_trace->enabled = true;
 	trace_branch_enable(tr);
  out:
 	mutex_unlock(&trace_types_lock);
@@ -3398,7 +3463,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 
 static int tracing_open_pipe(struct inode *inode, struct file *filp)
 {
-	long cpu_file = (long) inode->i_private;
+	struct trace_cpu *tc = inode->i_private;
+	struct trace_array *tr = tc->tr;
 	struct trace_iterator *iter;
 	int ret = 0;
 
@@ -3423,7 +3489,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 		ret = -ENOMEM;
 		goto fail;
 	}
-	*iter->trace = *current_trace;
+	*iter->trace = *tr->current_trace;
 
 	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
 		ret = -ENOMEM;
@@ -3440,8 +3506,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	if (trace_clocks[trace_clock_id].in_ns)
 		iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
 
-	iter->cpu_file = cpu_file;
-	iter->tr = &global_trace;
+	iter->cpu_file = tc->cpu;
+	iter->tr = tc->tr;
 	mutex_init(&iter->mutex);
 	filp->private_data = iter;
 
@@ -3563,6 +3629,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
 	struct trace_iterator *iter = filp->private_data;
+	struct trace_array *tr = iter->tr;
 	ssize_t sret;
 
 	/* return any leftover data */
@@ -3574,8 +3641,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
-	if (unlikely(iter->trace->name != current_trace->name))
-		*iter->trace = *current_trace;
+	if (unlikely(iter->trace->name != tr->current_trace->name))
+		*iter->trace = *tr->current_trace;
 	mutex_unlock(&trace_types_lock);
 
 	/*
@@ -3731,6 +3798,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		.ops		= &tracing_pipe_buf_ops,
 		.spd_release	= tracing_spd_release_pipe,
 	};
+	struct trace_array *tr = iter->tr;
 	ssize_t ret;
 	size_t rem;
 	unsigned int i;
@@ -3740,8 +3808,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 
 	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
-	if (unlikely(iter->trace->name != current_trace->name))
-		*iter->trace = *current_trace;
+	if (unlikely(iter->trace->name != tr->current_trace->name))
+		*iter->trace = *tr->current_trace;
 	mutex_unlock(&trace_types_lock);
 
 	mutex_lock(&iter->mutex);
@@ -3803,43 +3871,19 @@ out_err:
 	goto out;
 }
 
-struct ftrace_entries_info {
-	struct trace_array	*tr;
-	int			cpu;
-};
-
-static int tracing_entries_open(struct inode *inode, struct file *filp)
-{
-	struct ftrace_entries_info *info;
-
-	if (tracing_disabled)
-		return -ENODEV;
-
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	info->tr = &global_trace;
-	info->cpu = (unsigned long)inode->i_private;
-
-	filp->private_data = info;
-
-	return 0;
-}
-
 static ssize_t
 tracing_entries_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
-	struct ftrace_entries_info *info = filp->private_data;
-	struct trace_array *tr = info->tr;
+	struct trace_cpu *tc = filp->private_data;
+	struct trace_array *tr = tc->tr;
 	char buf[64];
 	int r = 0;
 	ssize_t ret;
 
 	mutex_lock(&trace_types_lock);
 
-	if (info->cpu == RING_BUFFER_ALL_CPUS) {
+	if (tc->cpu == RING_BUFFER_ALL_CPUS) {
 		int cpu, buf_size_same;
 		unsigned long size;
 
@@ -3866,7 +3910,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		} else
 			r = sprintf(buf, "X\n");
 	} else
-		r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
+		r = sprintf(buf, "%lu\n", tr->data[tc->cpu]->entries >> 10);
 
 	mutex_unlock(&trace_types_lock);
 
@@ -3878,7 +3922,7 @@ static ssize_t
 tracing_entries_write(struct file *filp, const char __user *ubuf,
 		      size_t cnt, loff_t *ppos)
 {
-	struct ftrace_entries_info *info = filp->private_data;
+	struct trace_cpu *tc = filp->private_data;
 	unsigned long val;
 	int ret;
 
@@ -3893,7 +3937,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	/* value is in KB */
 	val <<= 10;
 
-	ret = tracing_resize_ring_buffer(val, info->cpu);
+	ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
 	if (ret < 0)
 		return ret;
 
@@ -3902,16 +3946,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static int
-tracing_entries_release(struct inode *inode, struct file *filp)
-{
-	struct ftrace_entries_info *info = filp->private_data;
-
-	kfree(info);
-
-	return 0;
-}
-
 static ssize_t
 tracing_total_entries_read(struct file *filp, char __user *ubuf,
 				size_t cnt, loff_t *ppos)
@@ -3953,11 +3987,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
 static int
 tracing_free_buffer_release(struct inode *inode, struct file *filp)
 {
+	struct trace_array *tr = inode->i_private;
+
 	/* disable tracing ? */
 	if (trace_flags & TRACE_ITER_STOP_ON_FREE)
 		tracing_off();
 	/* resize the ring buffer to 0 */
-	tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);
+	tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
 
 	return 0;
 }
@@ -4068,13 +4104,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 
 static int tracing_clock_show(struct seq_file *m, void *v)
 {
+	struct trace_array *tr = m->private;
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
 		seq_printf(m,
 			"%s%s%s%s", i ? " " : "",
-			i == trace_clock_id ? "[" : "", trace_clocks[i].name,
-			i == trace_clock_id ? "]" : "");
+			i == tr->clock_id ? "[" : "", trace_clocks[i].name,
+			i == tr->clock_id ? "]" : "");
 	seq_putc(m, '\n');
 
 	return 0;
@@ -4083,6 +4120,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)
 static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
 				   size_t cnt, loff_t *fpos)
 {
+	struct seq_file *m = filp->private_data;
+	struct trace_array *tr = m->private;
 	char buf[64];
 	const char *clockstr;
 	int i;
@@ -4104,12 +4143,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
 	if (i == ARRAY_SIZE(trace_clocks))
 		return -EINVAL;
 
-	trace_clock_id = i;
-
 	mutex_lock(&trace_types_lock);
 
-	ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
-	if (max_tr.buffer)
+	tr->clock_id = i;
+
+	ring_buffer_set_clock(tr->buffer, trace_clocks[i].func);
+	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && max_tr.buffer)
 		ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
 
 	/*
@@ -4130,20 +4169,37 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
 {
 	if (tracing_disabled)
 		return -ENODEV;
-	return single_open(file, tracing_clock_show, NULL);
+
+	return single_open(file, tracing_clock_show, inode->i_private);
 }
 
 #ifdef CONFIG_TRACER_SNAPSHOT
 static int tracing_snapshot_open(struct inode *inode, struct file *file)
 {
+	struct trace_cpu *tc = inode->i_private;
 	struct trace_iterator *iter;
+	struct seq_file *m;
 	int ret = 0;
 
 	if (file->f_mode & FMODE_READ) {
 		iter = __tracing_open(inode, file, true);
 		if (IS_ERR(iter))
 			ret = PTR_ERR(iter);
+	} else {
+		/* Writes still need the seq_file to hold the private data */
+		m = kzalloc(sizeof(*m), GFP_KERNEL);
+		if (!m)
+			return -ENOMEM;
+		iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+		if (!iter) {
+			kfree(m);
+			return -ENOMEM;
+		}
+		iter->tr = tc->tr;
+		m->private = iter;
+		file->private_data = m;
 	}
+
 	return ret;
 }
 
@@ -4151,6 +4207,9 @@ static ssize_t
 tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 {
+	struct seq_file *m = filp->private_data;
+	struct trace_iterator *iter = m->private;
+	struct trace_array *tr = iter->tr;
 	unsigned long val;
 	int ret;
 
@@ -4164,30 +4223,30 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	mutex_lock(&trace_types_lock);
 
-	if (current_trace->use_max_tr) {
+	if (tr->current_trace->use_max_tr) {
 		ret = -EBUSY;
 		goto out;
 	}
 
 	switch (val) {
 	case 0:
-		if (current_trace->allocated_snapshot) {
+		if (tr->current_trace->allocated_snapshot) {
 			/* free spare buffer */
 			ring_buffer_resize(max_tr.buffer, 1,
 					   RING_BUFFER_ALL_CPUS);
 			set_buffer_entries(&max_tr, 1);
 			tracing_reset_online_cpus(&max_tr);
-			current_trace->allocated_snapshot = false;
+			tr->current_trace->allocated_snapshot = false;
 		}
 		break;
 	case 1:
-		if (!current_trace->allocated_snapshot) {
+		if (!tr->current_trace->allocated_snapshot) {
 			/* allocate spare buffer */
 			ret = resize_buffer_duplicate_size(&max_tr,
 					&global_trace, RING_BUFFER_ALL_CPUS);
 			if (ret < 0)
 				break;
-			current_trace->allocated_snapshot = true;
+			tr->current_trace->allocated_snapshot = true;
 		}
 
 		local_irq_disable();
@@ -4196,7 +4255,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		local_irq_enable();
 		break;
 	default:
-		if (current_trace->allocated_snapshot)
+		if (tr->current_trace->allocated_snapshot)
 			tracing_reset_online_cpus(&max_tr);
 		break;
 	}
@@ -4209,6 +4268,22 @@ out:
 	mutex_unlock(&trace_types_lock);
 	return ret;
 }
+
+static int tracing_snapshot_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	if (file->f_mode & FMODE_READ)
+		return tracing_release(inode, file);
+
+	/* If write only, the seq_file is just a stub */
+	if (m)
+		kfree(m->private);
+	kfree(m);
+
+	return 0;
+}
+
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
 
@@ -4236,10 +4311,9 @@ static const struct file_operations tracing_pipe_fops = {
 };
 
 static const struct file_operations tracing_entries_fops = {
-	.open		= tracing_entries_open,
+	.open		= tracing_open_generic,
 	.read		= tracing_entries_read,
 	.write		= tracing_entries_write,
-	.release	= tracing_entries_release,
 	.llseek		= generic_file_llseek,
 };
 
@@ -4274,7 +4348,7 @@ static const struct file_operations snapshot_fops = {
 	.read		= seq_read,
 	.write		= tracing_snapshot_write,
 	.llseek		= tracing_seek,
-	.release	= tracing_release,
+	.release	= tracing_snapshot_release,
 };
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
@@ -4287,7 +4361,8 @@ struct ftrace_buffer_info {
 
 static int tracing_buffers_open(struct inode *inode, struct file *filp)
 {
-	int cpu = (int)(long)inode->i_private;
+	struct trace_cpu *tc = inode->i_private;
+	struct trace_array *tr = tc->tr;
 	struct ftrace_buffer_info *info;
 
 	if (tracing_disabled)
@@ -4297,8 +4372,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 	if (!info)
 		return -ENOMEM;
 
-	info->tr	= &global_trace;
-	info->cpu	= cpu;
+	info->tr	= tr;
+	info->cpu	= tc->cpu;
 	info->spare	= NULL;
 	/* Force reading ring buffer for first read */
 	info->read	= (unsigned int)-1;
@@ -4535,12 +4610,13 @@ static ssize_t
 tracing_stats_read(struct file *filp, char __user *ubuf,
 		   size_t count, loff_t *ppos)
 {
-	unsigned long cpu = (unsigned long)filp->private_data;
-	struct trace_array *tr = &global_trace;
+	struct trace_cpu *tc = filp->private_data;
+	struct trace_array *tr = tc->tr;
 	struct trace_seq *s;
 	unsigned long cnt;
 	unsigned long long t;
 	unsigned long usec_rem;
+	int cpu = tc->cpu;
 
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
@@ -4636,58 +4712,57 @@ static const struct file_operations tracing_dyn_info_fops = {
 };
 #endif
 
-static struct dentry *d_tracer;
-
-struct dentry *tracing_init_dentry(void)
+struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
 {
 	static int once;
 
-	if (d_tracer)
-		return d_tracer;
+	if (tr->dir)
+		return tr->dir;
 
 	if (!debugfs_initialized())
 		return NULL;
 
-	d_tracer = debugfs_create_dir("tracing", NULL);
+	if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+		tr->dir = debugfs_create_dir("tracing", NULL);
 
-	if (!d_tracer && !once) {
+	if (!tr->dir && !once) {
 		once = 1;
 		pr_warning("Could not create debugfs directory 'tracing'\n");
 		return NULL;
 	}
 
-	return d_tracer;
+	return tr->dir;
 }
 
-static struct dentry *d_percpu;
+struct dentry *tracing_init_dentry(void)
+{
+	return tracing_init_dentry_tr(&global_trace);
+}
 
-static struct dentry *tracing_dentry_percpu(void)
+static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
 {
-	static int once;
 	struct dentry *d_tracer;
 
-	if (d_percpu)
-		return d_percpu;
-
-	d_tracer = tracing_init_dentry();
+	if (tr->percpu_dir)
+		return tr->percpu_dir;
 
+	d_tracer = tracing_init_dentry_tr(tr);
 	if (!d_tracer)
 		return NULL;
 
-	d_percpu = debugfs_create_dir("per_cpu", d_tracer);
+	tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
 
-	if (!d_percpu && !once) {
-		once = 1;
-		pr_warning("Could not create debugfs directory 'per_cpu'\n");
-		return NULL;
-	}
+	WARN_ONCE(!tr->percpu_dir,
+		  "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
 
-	return d_percpu;
+	return tr->percpu_dir;
 }
 
-static void tracing_init_debugfs_percpu(long cpu)
+static void
+tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
 {
-	struct dentry *d_percpu = tracing_dentry_percpu();
+	struct trace_array_cpu *data = tr->data[cpu];
+	struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
 	struct dentry *d_cpu;
 	char cpu_dir[30]; /* 30 characters should be more than enough */
 
@@ -4703,20 +4778,20 @@ static void tracing_init_debugfs_percpu(long cpu)
 
 	/* per cpu trace_pipe */
 	trace_create_file("trace_pipe", 0444, d_cpu,
-			(void *) cpu, &tracing_pipe_fops);
+			(void *)&data->trace_cpu, &tracing_pipe_fops);
 
 	/* per cpu trace */
 	trace_create_file("trace", 0644, d_cpu,
-			(void *) cpu, &tracing_fops);
+			(void *)&data->trace_cpu, &tracing_fops);
 
 	trace_create_file("trace_pipe_raw", 0444, d_cpu,
-			(void *) cpu, &tracing_buffers_fops);
+			(void *)&data->trace_cpu, &tracing_buffers_fops);
 
 	trace_create_file("stats", 0444, d_cpu,
-			(void *) cpu, &tracing_stats_fops);
+			(void *)&data->trace_cpu, &tracing_stats_fops);
 
 	trace_create_file("buffer_size_kb", 0444, d_cpu,
-			(void *) cpu, &tracing_entries_fops);
+			(void *)&data->trace_cpu, &tracing_entries_fops);
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
@@ -4727,6 +4802,7 @@ static void tracing_init_debugfs_percpu(long cpu)
 struct trace_option_dentry {
 	struct tracer_opt		*opt;
 	struct tracer_flags		*flags;
+	struct trace_array		*tr;
 	struct dentry			*entry;
 };
 
@@ -4762,7 +4838,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (!!(topt->flags->val & topt->opt->bit) != val) {
 		mutex_lock(&trace_types_lock);
-		ret = __set_tracer_option(current_trace, topt->flags,
+		ret = __set_tracer_option(topt->tr->current_trace, topt->flags,
 					  topt->opt, !val);
 		mutex_unlock(&trace_types_lock);
 		if (ret)
@@ -4801,6 +4877,7 @@ static ssize_t
 trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
 			 loff_t *ppos)
 {
+	struct trace_array *tr = &global_trace;
 	long index = (long)filp->private_data;
 	unsigned long val;
 	int ret;
@@ -4813,7 +4890,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return -EINVAL;
 
 	mutex_lock(&trace_types_lock);
-	ret = set_tracer_flag(1 << index, val);
+	ret = set_tracer_flag(tr, 1 << index, val);
 	mutex_unlock(&trace_types_lock);
 
 	if (ret < 0)
@@ -4847,40 +4924,41 @@ struct dentry *trace_create_file(const char *name,
 }
 
 
-static struct dentry *trace_options_init_dentry(void)
+static struct dentry *trace_options_init_dentry(struct trace_array *tr)
 {
 	struct dentry *d_tracer;
-	static struct dentry *t_options;
 
-	if (t_options)
-		return t_options;
+	if (tr->options)
+		return tr->options;
 
-	d_tracer = tracing_init_dentry();
+	d_tracer = tracing_init_dentry_tr(tr);
 	if (!d_tracer)
 		return NULL;
 
-	t_options = debugfs_create_dir("options", d_tracer);
-	if (!t_options) {
+	tr->options = debugfs_create_dir("options", d_tracer);
+	if (!tr->options) {
 		pr_warning("Could not create debugfs directory 'options'\n");
 		return NULL;
 	}
 
-	return t_options;
+	return tr->options;
 }
 
 static void
-create_trace_option_file(struct trace_option_dentry *topt,
+create_trace_option_file(struct trace_array *tr,
+			 struct trace_option_dentry *topt,
 			 struct tracer_flags *flags,
 			 struct tracer_opt *opt)
 {
 	struct dentry *t_options;
 
-	t_options = trace_options_init_dentry();
+	t_options = trace_options_init_dentry(tr);
 	if (!t_options)
 		return;
 
 	topt->flags = flags;
 	topt->opt = opt;
+	topt->tr = tr;
 
 	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
 				    &trace_options_fops);
@@ -4888,7 +4966,7 @@ create_trace_option_file(struct trace_option_dentry *topt,
 }
 
 static struct trace_option_dentry *
-create_trace_option_files(struct tracer *tracer)
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
 {
 	struct trace_option_dentry *topts;
 	struct tracer_flags *flags;
@@ -4913,7 +4991,7 @@ create_trace_option_files(struct tracer *tracer)
 		return NULL;
 
 	for (cnt = 0; opts[cnt].name; cnt++)
-		create_trace_option_file(&topts[cnt], flags,
+		create_trace_option_file(tr, &topts[cnt], flags,
 					 &opts[cnt]);
 
 	return topts;
@@ -4936,11 +5014,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
 }
 
 static struct dentry *
-create_trace_option_core_file(const char *option, long index)
+create_trace_option_core_file(struct trace_array *tr,
+			      const char *option, long index)
 {
 	struct dentry *t_options;
 
-	t_options = trace_options_init_dentry();
+	t_options = trace_options_init_dentry(tr);
 	if (!t_options)
 		return NULL;
 
@@ -4948,17 +5027,17 @@ create_trace_option_core_file(const char *option, long index)
 				    &trace_options_core_fops);
 }
 
-static __init void create_trace_options_dir(void)
+static __init void create_trace_options_dir(struct trace_array *tr)
 {
 	struct dentry *t_options;
 	int i;
 
-	t_options = trace_options_init_dentry();
+	t_options = trace_options_init_dentry(tr);
 	if (!t_options)
 		return;
 
 	for (i = 0; trace_options[i]; i++)
-		create_trace_option_core_file(trace_options[i], i);
+		create_trace_option_core_file(tr, trace_options[i], i);
 }
 
 static ssize_t
@@ -4997,12 +5076,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
 		mutex_lock(&trace_types_lock);
 		if (val) {
 			ring_buffer_record_on(buffer);
-			if (current_trace->start)
-				current_trace->start(tr);
+			if (tr->current_trace->start)
+				tr->current_trace->start(tr);
 		} else {
 			ring_buffer_record_off(buffer);
-			if (current_trace->stop)
-				current_trace->stop(tr);
+			if (tr->current_trace->stop)
+				tr->current_trace->stop(tr);
 		}
 		mutex_unlock(&trace_types_lock);
 	}
@@ -5019,6 +5098,38 @@ static const struct file_operations rb_simple_fops = {
 	.llseek		= default_llseek,
 };
 
+static void
+init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
+{
+
+	trace_create_file("trace_options", 0644, d_tracer,
+			  tr, &tracing_iter_fops);
+
+	trace_create_file("trace", 0644, d_tracer,
+			(void *)&tr->trace_cpu, &tracing_fops);
+
+	trace_create_file("trace_pipe", 0444, d_tracer,
+			(void *)&tr->trace_cpu, &tracing_pipe_fops);
+
+	trace_create_file("buffer_size_kb", 0644, d_tracer,
+			(void *)&tr->trace_cpu, &tracing_entries_fops);
+
+	trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+			  tr, &tracing_total_entries_fops);
+
+	trace_create_file("free_buffer", 0644, d_tracer,
+			  tr, &tracing_free_buffer_fops);
+
+	trace_create_file("trace_marker", 0220, d_tracer,
+			  tr, &tracing_mark_fops);
+
+	trace_create_file("trace_clock", 0644, d_tracer, tr,
+			  &trace_clock_fops);
+
+	trace_create_file("tracing_on", 0644, d_tracer,
+			    tr, &rb_simple_fops);
+}
+
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
@@ -5028,14 +5139,10 @@ static __init int tracer_init_debugfs(void)
 
 	d_tracer = tracing_init_dentry();
 
-	trace_create_file("trace_options", 0644, d_tracer,
-			NULL, &tracing_iter_fops);
+	init_tracer_debugfs(&global_trace, d_tracer);
 
 	trace_create_file("tracing_cpumask", 0644, d_tracer,
-			NULL, &tracing_cpumask_fops);
-
-	trace_create_file("trace", 0644, d_tracer,
-			(void *) RING_BUFFER_ALL_CPUS, &tracing_fops);
+			&global_trace, &tracing_cpumask_fops);
 
 	trace_create_file("available_tracers", 0444, d_tracer,
 			&global_trace, &show_traces_fops);
@@ -5054,30 +5161,9 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("README", 0444, d_tracer,
 			NULL, &tracing_readme_fops);
 
-	trace_create_file("trace_pipe", 0444, d_tracer,
-			(void *) RING_BUFFER_ALL_CPUS, &tracing_pipe_fops);
-
-	trace_create_file("buffer_size_kb", 0644, d_tracer,
-			(void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
-
-	trace_create_file("buffer_total_size_kb", 0444, d_tracer,
-			&global_trace, &tracing_total_entries_fops);
-
-	trace_create_file("free_buffer", 0644, d_tracer,
-			&global_trace, &tracing_free_buffer_fops);
-
-	trace_create_file("trace_marker", 0220, d_tracer,
-			NULL, &tracing_mark_fops);
-
 	trace_create_file("saved_cmdlines", 0444, d_tracer,
 			NULL, &tracing_saved_cmdlines_fops);
 
-	trace_create_file("trace_clock", 0644, d_tracer, NULL,
-			  &trace_clock_fops);
-
-	trace_create_file("tracing_on", 0644, d_tracer,
-			    &global_trace, &rb_simple_fops);
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -5085,13 +5171,13 @@ static __init int tracer_init_debugfs(void)
 
 #ifdef CONFIG_TRACER_SNAPSHOT
 	trace_create_file("snapshot", 0644, d_tracer,
-			  (void *) RING_BUFFER_ALL_CPUS, &snapshot_fops);
+			  (void *)&global_trace.trace_cpu, &snapshot_fops);
 #endif
 
-	create_trace_options_dir();
+	create_trace_options_dir(&global_trace);
 
 	for_each_tracing_cpu(cpu)
-		tracing_init_debugfs_percpu(cpu);
+		tracing_init_debugfs_percpu(&global_trace, cpu);
 
 	return 0;
 }
@@ -5161,7 +5247,7 @@ trace_printk_seq(struct trace_seq *s)
 void trace_init_global_iter(struct trace_iterator *iter)
 {
 	iter->tr = &global_trace;
-	iter->trace = current_trace;
+	iter->trace = iter->tr->current_trace;
 	iter->cpu_file = RING_BUFFER_ALL_CPUS;
 }
 
@@ -5315,6 +5401,8 @@ __init static int tracer_alloc_buffers(void)
 	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 	cpumask_copy(tracing_cpumask, cpu_all_mask);
 
+	raw_spin_lock_init(&global_trace.start_lock);
+
 	/* TODO: make the number of buffers hot pluggable with CPUS */
 	global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
 	if (!global_trace.buffer) {
@@ -5328,6 +5416,7 @@ __init static int tracer_alloc_buffers(void)
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 	max_tr.buffer = ring_buffer_alloc(1, rb_flags);
+	raw_spin_lock_init(&max_tr.start_lock);
 	if (!max_tr.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
 		WARN_ON(1);
@@ -5339,7 +5428,11 @@ __init static int tracer_alloc_buffers(void)
 	/* Allocate the first page for all buffers */
 	for_each_tracing_cpu(i) {
 		global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+		global_trace.data[i]->trace_cpu.cpu = i;
+		global_trace.data[i]->trace_cpu.tr = &global_trace;
 		max_tr.data[i] = &per_cpu(max_tr_data, i);
+		max_tr.data[i]->trace_cpu.cpu = i;
+		max_tr.data[i]->trace_cpu.tr = &max_tr;
 	}
 
 	set_buffer_entries(&global_trace,
@@ -5353,6 +5446,8 @@ __init static int tracer_alloc_buffers(void)
 
 	register_tracer(&nop_trace);
 
+	global_trace.current_trace = &nop_trace;
+
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
 
@@ -5363,6 +5458,10 @@ __init static int tracer_alloc_buffers(void)
 
 	global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
 
+	/* Holder for file callbacks */
+	global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
+	global_trace.trace_cpu.tr = &global_trace;
+
 	INIT_LIST_HEAD(&global_trace.systems);
 	INIT_LIST_HEAD(&global_trace.events);
 	list_add(&global_trace.list, &ftrace_trace_arrays);
@@ -5371,7 +5470,7 @@ __init static int tracer_alloc_buffers(void)
 		char *option;
 
 		option = strsep(&trace_boot_options, ",");
-		trace_set_options(option);
+		trace_set_options(&global_trace, option);
 	}
 
 	return 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index da09a037abcd..b80fbcf70af4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -127,12 +127,21 @@ enum trace_flag_type {
 
 #define TRACE_BUF_SIZE		1024
 
+struct trace_array;
+
+struct trace_cpu {
+	struct trace_array	*tr;
+	struct dentry		*dir;
+	int			cpu;
+};
+
 /*
  * The CPU trace array - it consists of thousands of trace entries
  * plus some other descriptor data: (for example which task started
  * the trace, etc.)
  */
 struct trace_array_cpu {
+	struct trace_cpu	trace_cpu;
 	atomic_t		disabled;
 	void			*buffer_page;	/* ring buffer spare */
 
@@ -151,6 +160,8 @@ struct trace_array_cpu {
 	char			comm[TASK_COMM_LEN];
 };
 
+struct tracer;
+
 /*
  * The trace array - an array of per-CPU trace arrays. This is the
  * highest level data structure that individual tracers deal with.
@@ -161,9 +172,16 @@ struct trace_array {
 	struct list_head	list;
 	int			cpu;
 	int			buffer_disabled;
+	struct trace_cpu	trace_cpu;	/* place holder */
+	int			stop_count;
+	int			clock_id;
+	struct tracer		*current_trace;
 	unsigned int		flags;
 	cycle_t			time_start;
+	raw_spinlock_t		start_lock;
 	struct dentry		*dir;
+	struct dentry		*options;
+	struct dentry		*percpu_dir;
 	struct dentry		*event_dir;
 	struct list_head	systems;
 	struct list_head	events;
@@ -474,6 +492,7 @@ struct dentry *trace_create_file(const char *name,
 				 void *data,
 				 const struct file_operations *fops);
 
+struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
 struct dentry *tracing_init_dentry(void);
 
 struct ring_buffer_event;
@@ -979,7 +998,7 @@ extern const char *__stop___trace_bprintk_fmt[];
 void trace_printk_init_buffers(void);
 void trace_printk_start_comm(void);
 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
-int set_tracer_flag(unsigned int mask, int enabled);
+int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 443b25b43b4f..b3cf6bf308ef 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -561,8 +561,8 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
 	save_flags = trace_flags;
 
 	/* non overwrite screws up the latency tracers */
-	set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
-	set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
+	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
+	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
 
 	tracing_max_latency = 0;
 	irqsoff_trace = tr;
@@ -581,8 +581,8 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
 
 	stop_irqsoff_tracer(tr, is_graph());
 
-	set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
-	set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
+	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
+	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
 }
 
 static void irqsoff_tracer_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fde652c9a511..5255a8477247 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -543,8 +543,8 @@ static int __wakeup_tracer_init(struct trace_array *tr)
 	save_flags = trace_flags;
 
 	/* non overwrite screws up the latency tracers */
-	set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
-	set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
+	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
+	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
 
 	tracing_max_latency = 0;
 	wakeup_trace = tr;
@@ -573,8 +573,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
 	/* make sure we put back any tasks we are tracing */
 	wakeup_reset(tr);
 
-	set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
-	set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
+	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
+	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
 }
 
 static void wakeup_tracer_start(struct trace_array *tr)
-- 
cgit v1.2.3


From ccb469a198cffac94a7eea0b69f715f06e2ddf15 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 2 Aug 2012 10:32:10 -0400
Subject: tracing: Pass the ftrace_file to the buffer lock reserve code

Pass the struct ftrace_event_file *ftrace_file to the
trace_event_buffer_lock_reserve() (new function that replaces the
trace_current_buffer_lock_reserver()).

The ftrace_file holds a pointer to the trace_array that is in use.
In the case of multiple buffers with different trace_arrays, this
allows different events to be recorded into different buffers.

Also fixed some of the stale comments in include/trace/ftrace.h

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 91fe40905828..29bff72f97ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1293,6 +1293,18 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
 
+struct ring_buffer_event *
+trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
+			  struct ftrace_event_file *ftrace_file,
+			  int type, unsigned long len,
+			  unsigned long flags, int pc)
+{
+	*current_rb = ftrace_file->tr->buffer;
+	return trace_buffer_lock_reserve(*current_rb,
+					 type, len, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
+
 struct ring_buffer_event *
 trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
 				  int type, unsigned long len,
-- 
cgit v1.2.3


From a7603ff4b5f7e26e67af82a4c3d05eeeb8d7b160 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 6 Aug 2012 16:24:11 -0400
Subject: tracing: Replace the static global per_cpu arrays with allocated
 per_cpu

The global and max-tr currently use static per_cpu arrays for the CPU data
descriptors. But in order to get new allocated trace_arrays, they need to
be allocated per_cpu arrays. Instead of using the static arrays, switch
the global and max-tr to use allocated data.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c                 | 92 +++++++++++++++++++++---------------
 kernel/trace/trace.h                 |  2 +-
 kernel/trace/trace_branch.c          |  6 ++-
 kernel/trace/trace_functions.c       |  4 +-
 kernel/trace/trace_functions_graph.c |  4 +-
 kernel/trace/trace_irqsoff.c         |  6 +--
 kernel/trace/trace_kdb.c             |  4 +-
 kernel/trace/trace_mmiotrace.c       |  4 +-
 kernel/trace/trace_sched_switch.c    |  4 +-
 kernel/trace/trace_sched_wakeup.c    | 14 +++---
 10 files changed, 79 insertions(+), 61 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 29bff72f97ef..406adbc277a0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -191,8 +191,6 @@ static struct trace_array	global_trace;
 
 LIST_HEAD(ftrace_trace_arrays);
 
-static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
-
 int filter_current_check_discard(struct ring_buffer *buffer,
 				 struct ftrace_event_call *call, void *rec,
 				 struct ring_buffer_event *event)
@@ -227,8 +225,6 @@ cycle_t ftrace_now(int cpu)
  */
 static struct trace_array	max_tr;
 
-static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
-
 int tracing_is_enabled(void)
 {
 	return tracing_is_on();
@@ -666,13 +662,13 @@ unsigned long __read_mostly	tracing_max_latency;
 static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
-	struct trace_array_cpu *data = tr->data[cpu];
+	struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu);
 	struct trace_array_cpu *max_data;
 
 	max_tr.cpu = cpu;
 	max_tr.time_start = data->preempt_timestamp;
 
-	max_data = max_tr.data[cpu];
+	max_data = per_cpu_ptr(max_tr.data, cpu);
 	max_data->saved_latency = tracing_max_latency;
 	max_data->critical_start = data->critical_start;
 	max_data->critical_end = data->critical_end;
@@ -1984,7 +1980,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 	unsigned long entries = 0;
 	u64 ts;
 
-	tr->data[cpu]->skipped_entries = 0;
+	per_cpu_ptr(tr->data, cpu)->skipped_entries = 0;
 
 	buf_iter = trace_buffer_iter(iter, cpu);
 	if (!buf_iter)
@@ -2004,7 +2000,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 		ring_buffer_read(buf_iter, NULL);
 	}
 
-	tr->data[cpu]->skipped_entries = entries;
+	per_cpu_ptr(tr->data, cpu)->skipped_entries = entries;
 }
 
 /*
@@ -2099,8 +2095,8 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
 		 * entries for the trace and we need to ignore the
 		 * ones before the time stamp.
 		 */
-		if (tr->data[cpu]->skipped_entries) {
-			count -= tr->data[cpu]->skipped_entries;
+		if (per_cpu_ptr(tr->data, cpu)->skipped_entries) {
+			count -= per_cpu_ptr(tr->data, cpu)->skipped_entries;
 			/* total is the same as the entries */
 			*total += count;
 		} else
@@ -2157,7 +2153,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 {
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_array *tr = iter->tr;
-	struct trace_array_cpu *data = tr->data[tr->cpu];
+	struct trace_array_cpu *data = per_cpu_ptr(tr->data, tr->cpu);
 	struct tracer *type = iter->trace;
 	unsigned long entries;
 	unsigned long total;
@@ -2227,7 +2223,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 	if (cpumask_test_cpu(iter->cpu, iter->started))
 		return;
 
-	if (iter->tr->data[iter->cpu]->skipped_entries)
+	if (per_cpu_ptr(iter->tr->data, iter->cpu)->skipped_entries)
 		return;
 
 	cpumask_set_cpu(iter->cpu, iter->started);
@@ -2858,12 +2854,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 		 */
 		if (cpumask_test_cpu(cpu, tracing_cpumask) &&
 				!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
-			atomic_inc(&tr->data[cpu]->disabled);
+			atomic_inc(&per_cpu_ptr(tr->data, cpu)->disabled);
 			ring_buffer_record_disable_cpu(tr->buffer, cpu);
 		}
 		if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
 				cpumask_test_cpu(cpu, tracing_cpumask_new)) {
-			atomic_dec(&tr->data[cpu]->disabled);
+			atomic_dec(&per_cpu_ptr(tr->data, cpu)->disabled);
 			ring_buffer_record_enable_cpu(tr->buffer, cpu);
 		}
 	}
@@ -3177,7 +3173,7 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val)
 {
 	int cpu;
 	for_each_tracing_cpu(cpu)
-		tr->data[cpu]->entries = val;
+		per_cpu_ptr(tr->data, cpu)->entries = val;
 }
 
 /* resize @tr's buffer to the size of @size_tr's entries */
@@ -3189,17 +3185,18 @@ static int resize_buffer_duplicate_size(struct trace_array *tr,
 	if (cpu_id == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			ret = ring_buffer_resize(tr->buffer,
-					size_tr->data[cpu]->entries, cpu);
+				 per_cpu_ptr(size_tr->data, cpu)->entries, cpu);
 			if (ret < 0)
 				break;
-			tr->data[cpu]->entries = size_tr->data[cpu]->entries;
+			per_cpu_ptr(tr->data, cpu)->entries =
+				per_cpu_ptr(size_tr->data, cpu)->entries;
 		}
 	} else {
 		ret = ring_buffer_resize(tr->buffer,
-					size_tr->data[cpu_id]->entries, cpu_id);
+				 per_cpu_ptr(size_tr->data, cpu_id)->entries, cpu_id);
 		if (ret == 0)
-			tr->data[cpu_id]->entries =
-				size_tr->data[cpu_id]->entries;
+			per_cpu_ptr(tr->data, cpu_id)->entries =
+				per_cpu_ptr(size_tr->data, cpu_id)->entries;
 	}
 
 	return ret;
@@ -3256,13 +3253,13 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 	if (cpu == RING_BUFFER_ALL_CPUS)
 		set_buffer_entries(&max_tr, size);
 	else
-		max_tr.data[cpu]->entries = size;
+		per_cpu_ptr(max_tr.data, cpu)->entries = size;
 
  out:
 	if (cpu == RING_BUFFER_ALL_CPUS)
 		set_buffer_entries(tr, size);
 	else
-		tr->data[cpu]->entries = size;
+		per_cpu_ptr(tr->data, cpu)->entries = size;
 
 	return ret;
 }
@@ -3905,8 +3902,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		for_each_tracing_cpu(cpu) {
 			/* fill in the size from first enabled cpu */
 			if (size == 0)
-				size = tr->data[cpu]->entries;
-			if (size != tr->data[cpu]->entries) {
+				size = per_cpu_ptr(tr->data, cpu)->entries;
+			if (size != per_cpu_ptr(tr->data, cpu)->entries) {
 				buf_size_same = 0;
 				break;
 			}
@@ -3922,7 +3919,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		} else
 			r = sprintf(buf, "X\n");
 	} else
-		r = sprintf(buf, "%lu\n", tr->data[tc->cpu]->entries >> 10);
+		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->data, tc->cpu)->entries >> 10);
 
 	mutex_unlock(&trace_types_lock);
 
@@ -3969,7 +3966,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
 
 	mutex_lock(&trace_types_lock);
 	for_each_tracing_cpu(cpu) {
-		size += tr->data[cpu]->entries >> 10;
+		size += per_cpu_ptr(tr->data, cpu)->entries >> 10;
 		if (!ring_buffer_expanded)
 			expanded_size += trace_buf_size >> 10;
 	}
@@ -4773,7 +4770,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
 static void
 tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
 {
-	struct trace_array_cpu *data = tr->data[cpu];
+	struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu);
 	struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
 	struct dentry *d_cpu;
 	char cpu_dir[30]; /* 30 characters should be more than enough */
@@ -5298,7 +5295,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 	trace_init_global_iter(&iter);
 
 	for_each_tracing_cpu(cpu) {
-		atomic_inc(&iter.tr->data[cpu]->disabled);
+		atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
 	}
 
 	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -5366,7 +5363,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 		trace_flags |= old_userobj;
 
 		for_each_tracing_cpu(cpu) {
-			atomic_dec(&iter.tr->data[cpu]->disabled);
+			atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
 		}
 		tracing_on();
 	}
@@ -5422,11 +5419,31 @@ __init static int tracer_alloc_buffers(void)
 		WARN_ON(1);
 		goto out_free_cpumask;
 	}
+
+	global_trace.data = alloc_percpu(struct trace_array_cpu);
+
+	if (!global_trace.data) {
+		printk(KERN_ERR "tracer: failed to allocate percpu memory!\n");
+		WARN_ON(1);
+		goto out_free_cpumask;
+	}
+
+	for_each_tracing_cpu(i) {
+		memset(per_cpu_ptr(global_trace.data, i), 0, sizeof(struct trace_array_cpu));
+		per_cpu_ptr(global_trace.data, i)->trace_cpu.cpu = i;
+		per_cpu_ptr(global_trace.data, i)->trace_cpu.tr = &global_trace;
+	}
+
 	if (global_trace.buffer_disabled)
 		tracing_off();
 
-
 #ifdef CONFIG_TRACER_MAX_TRACE
+	max_tr.data = alloc_percpu(struct trace_array_cpu);
+	if (!max_tr.data) {
+		printk(KERN_ERR "tracer: failed to allocate percpu memory!\n");
+		WARN_ON(1);
+		goto out_free_cpumask;
+	}
 	max_tr.buffer = ring_buffer_alloc(1, rb_flags);
 	raw_spin_lock_init(&max_tr.start_lock);
 	if (!max_tr.buffer) {
@@ -5435,18 +5452,15 @@ __init static int tracer_alloc_buffers(void)
 		ring_buffer_free(global_trace.buffer);
 		goto out_free_cpumask;
 	}
-#endif
 
-	/* Allocate the first page for all buffers */
 	for_each_tracing_cpu(i) {
-		global_trace.data[i] = &per_cpu(global_trace_cpu, i);
-		global_trace.data[i]->trace_cpu.cpu = i;
-		global_trace.data[i]->trace_cpu.tr = &global_trace;
-		max_tr.data[i] = &per_cpu(max_tr_data, i);
-		max_tr.data[i]->trace_cpu.cpu = i;
-		max_tr.data[i]->trace_cpu.tr = &max_tr;
+		memset(per_cpu_ptr(max_tr.data, i), 0, sizeof(struct trace_array_cpu));
+		per_cpu_ptr(max_tr.data, i)->trace_cpu.cpu = i;
+		per_cpu_ptr(max_tr.data, i)->trace_cpu.tr = &max_tr;
 	}
+#endif
 
+	/* Allocate the first page for all buffers */
 	set_buffer_entries(&global_trace,
 			   ring_buffer_size(global_trace.buffer, 0));
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -5488,6 +5502,8 @@ __init static int tracer_alloc_buffers(void)
 	return 0;
 
 out_free_cpumask:
+	free_percpu(global_trace.data);
+	free_percpu(max_tr.data);
 	free_cpumask_var(tracing_cpumask);
 out_free_buffer_mask:
 	free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b80fbcf70af4..15ccd7cd1560 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -186,7 +186,7 @@ struct trace_array {
 	struct list_head	systems;
 	struct list_head	events;
 	struct task_struct	*waiter;
-	struct trace_array_cpu	*data[NR_CPUS];
+	struct trace_array_cpu	*data;
 };
 
 enum {
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed29..6dadbefbb1d6 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
 	struct ftrace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
+	struct trace_array_cpu *data;
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
 	struct ring_buffer *buffer;
@@ -51,7 +52,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+	data = per_cpu_ptr(tr->data, cpu);
+	if (atomic_inc_return(&data->disabled) != 1)
 		goto out;
 
 	pc = preempt_count();
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 		__buffer_unlock_commit(buffer, event);
 
  out:
-	atomic_dec(&tr->data[cpu]->disabled);
+	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 601152523326..9d73861efc6a 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 		goto out;
 
 	cpu = smp_processor_id();
-	data = tr->data[cpu];
+	data = per_cpu_ptr(tr->data, cpu);
 	if (!atomic_read(&data->disabled)) {
 		local_save_flags(flags);
 		trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	 */
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
+	data = per_cpu_ptr(tr->data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1)) {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 39ada66389cc..ca986d61a282 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
+	data = per_cpu_ptr(tr->data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
+	data = per_cpu_ptr(tr->data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b3cf6bf308ef..9b52f9cf7a0d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -121,7 +121,7 @@ static int func_prolog_dec(struct trace_array *tr,
 	if (!irqs_disabled_flags(*flags))
 		return 0;
 
-	*data = tr->data[cpu];
+	*data = per_cpu_ptr(tr->data, cpu);
 	disabled = atomic_inc_return(&(*data)->disabled);
 
 	if (likely(disabled == 1))
@@ -380,7 +380,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	if (per_cpu(tracing_cpu, cpu))
 		return;
 
-	data = tr->data[cpu];
+	data = per_cpu_ptr(tr->data, cpu);
 
 	if (unlikely(!data) || atomic_read(&data->disabled))
 		return;
@@ -418,7 +418,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	if (!tracer_enabled)
 		return;
 
-	data = tr->data[cpu];
+	data = per_cpu_ptr(tr->data, cpu);
 
 	if (unlikely(!data) ||
 	    !data->critical_start || atomic_read(&data->disabled))
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index cc1dbdc5ee5d..349f6941e8f2 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
 	trace_init_global_iter(&iter);
 
 	for_each_tracing_cpu(cpu) {
-		atomic_inc(&iter.tr->data[cpu]->disabled);
+		atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
 	}
 
 	old_userobj = trace_flags;
@@ -83,7 +83,7 @@ out:
 	trace_flags = old_userobj;
 
 	for_each_tracing_cpu(cpu) {
-		atomic_dec(&iter.tr->data[cpu]->disabled);
+		atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
 	}
 
 	for_each_tracing_cpu(cpu)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fd3c8aae55e5..2472f6f76b50 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 void mmio_trace_rw(struct mmiotrace_rw *rw)
 {
 	struct trace_array *tr = mmio_trace_array;
-	struct trace_array_cpu *data = tr->data[smp_processor_id()];
+	struct trace_array_cpu *data = per_cpu_ptr(tr->data, smp_processor_id());
 	__trace_mmiotrace_rw(tr, data, rw);
 }
 
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
 	struct trace_array_cpu *data;
 
 	preempt_disable();
-	data = tr->data[smp_processor_id()];
+	data = per_cpu_ptr(tr->data, smp_processor_id());
 	__trace_mmiotrace_map(tr, data, map);
 	preempt_enable();
 }
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd8..1ffe39abd6fc 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
 	pc = preempt_count();
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = ctx_trace->data[cpu];
+	data = per_cpu_ptr(ctx_trace->data, cpu);
 
 	if (likely(!atomic_read(&data->disabled)))
 		tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
 	pc = preempt_count();
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = ctx_trace->data[cpu];
+	data = per_cpu_ptr(ctx_trace->data, cpu);
 
 	if (likely(!atomic_read(&data->disabled)))
 		tracing_sched_wakeup_trace(ctx_trace, wakee, current,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5255a8477247..f9ceb75a95b7 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -89,7 +89,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
 	if (cpu != wakeup_current_cpu)
 		goto out_enable;
 
-	*data = tr->data[cpu];
+	*data = per_cpu_ptr(tr->data, cpu);
 	disabled = atomic_inc_return(&(*data)->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
@@ -353,7 +353,7 @@ probe_wakeup_sched_switch(void *ignore,
 
 	/* disable local data, not wakeup_cpu data */
 	cpu = raw_smp_processor_id();
-	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
+	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled);
 	if (likely(disabled != 1))
 		goto out;
 
@@ -365,7 +365,7 @@ probe_wakeup_sched_switch(void *ignore,
 		goto out_unlock;
 
 	/* The task we are waiting for is waking up */
-	data = wakeup_trace->data[wakeup_cpu];
+	data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu);
 
 	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +387,7 @@ out_unlock:
 	arch_spin_unlock(&wakeup_lock);
 	local_irq_restore(flags);
 out:
-	atomic_dec(&wakeup_trace->data[cpu]->disabled);
+	atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled);
 }
 
 static void __wakeup_reset(struct trace_array *tr)
@@ -435,7 +435,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 		return;
 
 	pc = preempt_count();
-	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
+	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
 
@@ -458,7 +458,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 
 	local_save_flags(flags);
 
-	data = wakeup_trace->data[wakeup_cpu];
+	data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu);
 	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
 
@@ -472,7 +472,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 out_locked:
 	arch_spin_unlock(&wakeup_lock);
 out:
-	atomic_dec(&wakeup_trace->data[cpu]->disabled);
+	atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled);
 }
 
 static void start_wakeup_tracer(struct trace_array *tr)
-- 
cgit v1.2.3


From 12ab74ee00d154bc05ea2fc659b7ce6519e5d5a6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 8 Aug 2012 14:48:20 -0400
Subject: tracing: Make syscall events suitable for multiple buffers

Currently the syscall events record into the global buffer. But if
multiple buffers are in place, then we need to have syscall events
record in the proper buffers.

By adding descriptors to pass to the syscall event functions, the
syscall events can now record into the buffers that have been assigned
to them (one event may be applied to mulitple buffers).

This will allow tracing high volume syscalls along with seldom occurring
syscalls without losing the seldom syscall events.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h          | 11 ++++++
 kernel/trace/trace_syscalls.c | 80 +++++++++++++++++++++++++------------------
 2 files changed, 57 insertions(+), 34 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 15ccd7cd1560..68cad7a9e089 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,11 @@
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
 
+#ifdef CONFIG_FTRACE_SYSCALLS
+#include <asm/unistd.h>		/* For NR_SYSCALLS	     */
+#include <asm/syscall.h>	/* some archs define it here */
+#endif
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -173,6 +178,12 @@ struct trace_array {
 	int			cpu;
 	int			buffer_disabled;
 	struct trace_cpu	trace_cpu;	/* place holder */
+#ifdef CONFIG_FTRACE_SYSCALLS
+	int			sys_refcount_enter;
+	int			sys_refcount_exit;
+	DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
+	DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+#endif
 	int			stop_count;
 	int			clock_id;
 	struct tracer		*current_trace;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7a809e321058..a842783ad6be 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -12,10 +12,6 @@
 #include "trace.h"
 
 static DEFINE_MUTEX(syscall_trace_lock);
-static int sys_refcount_enter;
-static int sys_refcount_exit;
-static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
 
 static int syscall_enter_register(struct ftrace_event_call *event,
 				 enum trace_reg type, void *data);
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
 	return ret;
 }
 
-static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 {
+	struct trace_array *tr = data;
 	struct syscall_trace_enter *entry;
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0)
 		return;
-	if (!test_bit(syscall_nr, enabled_enter_syscalls))
+	if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
-	event = trace_current_buffer_lock_reserve(&buffer,
+	buffer = tr->buffer;
+	event = trace_buffer_lock_reserve(buffer,
 			sys_data->enter_event->event.type, size, 0, 0);
 	if (!event)
 		return;
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 }
 
-static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 {
+	struct trace_array *tr = data;
 	struct syscall_trace_exit *entry;
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0)
 		return;
-	if (!test_bit(syscall_nr, enabled_exit_syscalls))
+	if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
 
-	event = trace_current_buffer_lock_reserve(&buffer,
+	buffer = tr->buffer;
+	event = trace_buffer_lock_reserve(buffer,
 			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
 	if (!event)
 		return;
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 }
 
-static int reg_event_syscall_enter(struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct ftrace_event_file *file,
+				   struct ftrace_event_call *call)
 {
+	struct trace_array *tr = file->tr;
 	int ret = 0;
 	int num;
 
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)
 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
-	if (!sys_refcount_enter)
-		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
+	if (!tr->sys_refcount_enter)
+		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
 	if (!ret) {
-		set_bit(num, enabled_enter_syscalls);
-		sys_refcount_enter++;
+		set_bit(num, tr->enabled_enter_syscalls);
+		tr->sys_refcount_enter++;
 	}
 	mutex_unlock(&syscall_trace_lock);
 	return ret;
 }
 
-static void unreg_event_syscall_enter(struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct ftrace_event_file *file,
+				      struct ftrace_event_call *call)
 {
+	struct trace_array *tr = file->tr;
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return;
 	mutex_lock(&syscall_trace_lock);
-	sys_refcount_enter--;
-	clear_bit(num, enabled_enter_syscalls);
-	if (!sys_refcount_enter)
-		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
+	tr->sys_refcount_enter--;
+	clear_bit(num, tr->enabled_enter_syscalls);
+	if (!tr->sys_refcount_enter)
+		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static int reg_event_syscall_exit(struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct ftrace_event_file *file,
+				  struct ftrace_event_call *call)
 {
+	struct trace_array *tr = file->tr;
 	int ret = 0;
 	int num;
 
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)
 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
-	if (!sys_refcount_exit)
-		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
+	if (!tr->sys_refcount_exit)
+		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
 	if (!ret) {
-		set_bit(num, enabled_exit_syscalls);
-		sys_refcount_exit++;
+		set_bit(num, tr->enabled_exit_syscalls);
+		tr->sys_refcount_exit++;
 	}
 	mutex_unlock(&syscall_trace_lock);
 	return ret;
 }
 
-static void unreg_event_syscall_exit(struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct ftrace_event_file *file,
+				     struct ftrace_event_call *call)
 {
+	struct trace_array *tr = file->tr;
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return;
 	mutex_lock(&syscall_trace_lock);
-	sys_refcount_exit--;
-	clear_bit(num, enabled_exit_syscalls);
-	if (!sys_refcount_exit)
-		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
+	tr->sys_refcount_exit--;
+	clear_bit(num, tr->enabled_exit_syscalls);
+	if (!tr->sys_refcount_exit)
+		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
 	mutex_unlock(&syscall_trace_lock);
 }
 
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
 static int syscall_enter_register(struct ftrace_event_call *event,
 				 enum trace_reg type, void *data)
 {
+	struct ftrace_event_file *file = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
-		return reg_event_syscall_enter(event);
+		return reg_event_syscall_enter(file, event);
 	case TRACE_REG_UNREGISTER:
-		unreg_event_syscall_enter(event);
+		unreg_event_syscall_enter(file, event);
 		return 0;
 
 #ifdef CONFIG_PERF_EVENTS
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
 static int syscall_exit_register(struct ftrace_event_call *event,
 				 enum trace_reg type, void *data)
 {
+	struct ftrace_event_file *file = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
-		return reg_event_syscall_exit(event);
+		return reg_event_syscall_exit(file, event);
 	case TRACE_REG_UNREGISTER:
-		unreg_event_syscall_exit(event);
+		unreg_event_syscall_exit(file, event);
 		return 0;
 
 #ifdef CONFIG_PERF_EVENTS
-- 
cgit v1.2.3


From 277ba04461c2746cf935353474c0961161951b68 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 3 Aug 2012 16:10:49 -0400
Subject: tracing: Add interface to allow multiple trace buffers

Add the interface ("instances" directory) to add multiple buffers
to ftrace. To create a new instance, simply do a mkdir in the
instances directory:

This will create a directory with the following:

 # cd instances
 # mkdir foo
 # ls foo
buffer_size_kb        free_buffer  trace_clock    trace_pipe
buffer_total_size_kb  set_event    trace_marker   tracing_enabled
events/               trace        trace_options  tracing_on

Currently only events are able to be set, and there isn't a way
to delete a buffer when one is created (yet).

Note, the i_mutex lock is dropped from the parent "instances"
directory during the mkdir operation. As the "instances" directory
can not be renamed or deleted (created on boot), I do not see
any harm in dropping the lock. The creation of the sub directories
is protected by trace_types_lock mutex, which only lets one
instance get into the code path at a time. If two tasks try to
create or delete directories of the same name, only one will occur
and the other will fail with -EEXIST.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 129 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h        |   2 +
 kernel/trace/trace_events.c |  12 ++++-
 3 files changed, 142 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 406adbc277a0..07a63114d938 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5107,6 +5107,133 @@ static const struct file_operations rb_simple_fops = {
 	.llseek		= default_llseek,
 };
 
+struct dentry *trace_instance_dir;
+
+static void
+init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
+
+static int new_instance_create(const char *name)
+{
+	enum ring_buffer_flags rb_flags;
+	struct trace_array *tr;
+	int ret;
+	int i;
+
+	mutex_lock(&trace_types_lock);
+
+	ret = -EEXIST;
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		if (tr->name && strcmp(tr->name, name) == 0)
+			goto out_unlock;
+	}
+
+	ret = -ENOMEM;
+	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+	if (!tr)
+		goto out_unlock;
+
+	tr->name = kstrdup(name, GFP_KERNEL);
+	if (!tr->name)
+		goto out_free_tr;
+
+	raw_spin_lock_init(&tr->start_lock);
+
+	tr->current_trace = &nop_trace;
+
+	INIT_LIST_HEAD(&tr->systems);
+	INIT_LIST_HEAD(&tr->events);
+
+	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+
+	tr->buffer = ring_buffer_alloc(trace_buf_size, rb_flags);
+	if (!tr->buffer)
+		goto out_free_tr;
+
+	tr->data = alloc_percpu(struct trace_array_cpu);
+	if (!tr->data)
+		goto out_free_tr;
+
+	for_each_tracing_cpu(i) {
+		memset(per_cpu_ptr(tr->data, i), 0, sizeof(struct trace_array_cpu));
+		per_cpu_ptr(tr->data, i)->trace_cpu.cpu = i;
+		per_cpu_ptr(tr->data, i)->trace_cpu.tr = tr;
+	}
+
+	/* Holder for file callbacks */
+	tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
+	tr->trace_cpu.tr = tr;
+
+	tr->dir = debugfs_create_dir(name, trace_instance_dir);
+	if (!tr->dir)
+		goto out_free_tr;
+
+	ret = event_trace_add_tracer(tr->dir, tr);
+	if (ret)
+		goto out_free_tr;
+
+	init_tracer_debugfs(tr, tr->dir);
+
+	list_add(&tr->list, &ftrace_trace_arrays);
+
+	mutex_unlock(&trace_types_lock);
+
+	return 0;
+
+ out_free_tr:
+	if (tr->buffer)
+		ring_buffer_free(tr->buffer);
+	kfree(tr->name);
+	kfree(tr);
+
+ out_unlock:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+
+}
+
+static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
+{
+	struct dentry *parent;
+	int ret;
+
+	/* Paranoid: Make sure the parent is the "instances" directory */
+	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+	if (WARN_ON_ONCE(parent != trace_instance_dir))
+		return -ENOENT;
+
+	/*
+	 * The inode mutex is locked, but debugfs_create_dir() will also
+	 * take the mutex. As the instances directory can not be destroyed
+	 * or changed in any other way, it is safe to unlock it, and
+	 * let the dentry try. If two users try to make the same dir at
+	 * the same time, then the new_instance_create() will determine the
+	 * winner.
+	 */
+	mutex_unlock(&inode->i_mutex);
+
+	ret = new_instance_create(dentry->d_iname);
+
+	mutex_lock(&inode->i_mutex);
+
+	return ret;
+}
+
+static const struct inode_operations instance_dir_inode_operations = {
+	.lookup		= simple_lookup,
+	.mkdir		= instance_mkdir,
+};
+
+static __init void create_trace_instances(struct dentry *d_tracer)
+{
+	trace_instance_dir = debugfs_create_dir("instances", d_tracer);
+	if (WARN_ON(!trace_instance_dir))
+		return;
+
+	/* Hijack the dir inode operations, to allow mkdir */
+	trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
+}
+
 static void
 init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
 {
@@ -5183,6 +5310,8 @@ static __init int tracer_init_debugfs(void)
 			  (void *)&global_trace.trace_cpu, &snapshot_fops);
 #endif
 
+	create_trace_instances(d_tracer);
+
 	create_trace_options_dir(&global_trace);
 
 	for_each_tracing_cpu(cpu)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 68cad7a9e089..883fe0b62f0a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -175,6 +175,7 @@ struct tracer;
 struct trace_array {
 	struct ring_buffer	*buffer;
 	struct list_head	list;
+	char			*name;
 	int			cpu;
 	int			buffer_disabled;
 	struct trace_cpu	trace_cpu;	/* place holder */
@@ -999,6 +1000,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 }
 
 extern void trace_event_enable_cmd_record(bool enable);
+extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
 
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 439955239bae..58a61302a733 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1754,16 +1754,22 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 	struct dentry *d_events;
 	struct dentry *entry;
 
+	mutex_lock(&event_mutex);
+
 	entry = debugfs_create_file("set_event", 0644, parent,
 				    tr, &ftrace_set_event_fops);
 	if (!entry) {
 		pr_warning("Could not create debugfs 'set_event' entry\n");
+		mutex_unlock(&event_mutex);
 		return -ENOMEM;
 	}
 
 	d_events = debugfs_create_dir("events", parent);
-	if (!d_events)
+	if (!d_events) {
 		pr_warning("Could not create debugfs 'events' directory\n");
+		mutex_unlock(&event_mutex);
+		return -ENOMEM;
+	}
 
 	/* ring buffer internal formats */
 	trace_create_file("header_page", 0444, d_events,
@@ -1778,7 +1784,11 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 			  tr, &ftrace_tr_enable_fops);
 
 	tr->event_dir = d_events;
+	down_write(&trace_event_mutex);
 	__trace_add_event_dirs(tr);
+	up_write(&trace_event_mutex);
+
+	mutex_unlock(&event_mutex);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 0c8916c34203734d3b05953ebace52d7c2969f16 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 7 Aug 2012 16:14:16 -0400
Subject: tracing: Add rmdir to remove multibuffer instances

Add a method to the hijacked dentry descriptor of the
"instances" directory to allow for rmdir to remove an
instance of a multibuffer.

Example:

  cd /debug/tracing/instances
  mkdir hello
  ls
hello/
  rmdir hello
  ls

Like the mkdir method, the i_mutex is dropped for the instances
directory. The instances directory is created at boot up and can
not be renamed or removed. The trace_types_lock mutex is used to
synchronize adding and removing of instances.

I've run several stress tests with different threads trying to
create and delete directories of the same name, and it has stood
up fine.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 68 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h        |  1 +
 kernel/trace/trace_events.c | 33 ++++++++++++++++++++++
 3 files changed, 102 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 07a63114d938..ab3df804fa96 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5192,6 +5192,42 @@ static int new_instance_create(const char *name)
 
 }
 
+static int instance_delete(const char *name)
+{
+	struct trace_array *tr;
+	int found = 0;
+	int ret;
+
+	mutex_lock(&trace_types_lock);
+
+	ret = -ENODEV;
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		if (tr->name && strcmp(tr->name, name) == 0) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		goto out_unlock;
+
+	list_del(&tr->list);
+
+	event_trace_del_tracer(tr);
+	debugfs_remove_recursive(tr->dir);
+	free_percpu(tr->data);
+	ring_buffer_free(tr->buffer);
+
+	kfree(tr->name);
+	kfree(tr);
+
+	ret = 0;
+
+ out_unlock:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
 static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
 {
 	struct dentry *parent;
@@ -5219,9 +5255,41 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
 	return ret;
 }
 
+static int instance_rmdir(struct inode *inode, struct dentry *dentry)
+{
+	struct dentry *parent;
+	int ret;
+
+	/* Paranoid: Make sure the parent is the "instances" directory */
+	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+	if (WARN_ON_ONCE(parent != trace_instance_dir))
+		return -ENOENT;
+
+	/* The caller did a dget() on dentry */
+	mutex_unlock(&dentry->d_inode->i_mutex);
+
+	/*
+	 * The inode mutex is locked, but debugfs_create_dir() will also
+	 * take the mutex. As the instances directory can not be destroyed
+	 * or changed in any other way, it is safe to unlock it, and
+	 * let the dentry try. If two users try to make the same dir at
+	 * the same time, then the instance_delete() will determine the
+	 * winner.
+	 */
+	mutex_unlock(&inode->i_mutex);
+
+	ret = instance_delete(dentry->d_iname);
+
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+	mutex_lock(&dentry->d_inode->i_mutex);
+
+	return ret;
+}
+
 static const struct inode_operations instance_dir_inode_operations = {
 	.lookup		= simple_lookup,
 	.mkdir		= instance_mkdir,
+	.rmdir		= instance_rmdir,
 };
 
 static __init void create_trace_instances(struct dentry *d_tracer)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 883fe0b62f0a..b825ea2d8c64 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1001,6 +1001,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 
 extern void trace_event_enable_cmd_record(bool enable);
 extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
+extern int event_trace_del_tracer(struct trace_array *tr);
 
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 58a61302a733..06d6bc275221 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1709,6 +1709,20 @@ __trace_add_event_dirs(struct trace_array *tr)
 	}
 }
 
+/* Remove the event directory structure for a trace directory. */
+static void
+__trace_remove_event_dirs(struct trace_array *tr)
+{
+	struct ftrace_event_file *file, *next;
+
+	list_for_each_entry_safe(file, next, &tr->events, list) {
+		list_del(&file->list);
+		debugfs_remove_recursive(file->dir);
+		remove_subsystem(file->system);
+		kfree(file);
+	}
+}
+
 static void
 __add_event_to_tracers(struct ftrace_event_call *call,
 		       struct ftrace_module_file_ops *file_ops)
@@ -1793,6 +1807,25 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 	return 0;
 }
 
+int event_trace_del_tracer(struct trace_array *tr)
+{
+	/* Disable any running events */
+	__ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
+
+	mutex_lock(&event_mutex);
+
+	down_write(&trace_event_mutex);
+	__trace_remove_event_dirs(tr);
+	debugfs_remove_recursive(tr->event_dir);
+	up_write(&trace_event_mutex);
+
+	tr->event_dir = NULL;
+
+	mutex_unlock(&event_mutex);
+
+	return 0;
+}
+
 static __init int event_trace_enable(void)
 {
 	struct trace_array *tr = top_trace_array();
-- 
cgit v1.2.3


From 772482216f170ddc62fa92a3cc3271cdd1993525 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 27 Feb 2013 16:28:06 -0500
Subject: tracing: Get trace_events kernel command line working again

With the new descriptors used to allow multiple buffers in the
tracing directory added, the kernel command line parameter
trace_events=... no longer works. This is because the top level
(global) trace array now has a list of descriptors associated
with the events and the files in the debugfs directory. But in
early bootup, when the command line is processed and the events
enabled, the trace array list of events has not been set up yet.

Without the list of events in the trace array, the setting of
events to record will fail because it would not match any events.

The solution is to set up the top level array in two stages.
The first is to just add the ftrace file descriptors that just point
to the events. This will allow events to be enabled and start tracing.
The second stage is called after the filesystem is set up, and this
stage will create the debugfs event files and directories associated
with the trace array events.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 143 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 136 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 06d6bc275221..21fe83b4106a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1473,6 +1473,28 @@ __trace_add_new_event(struct ftrace_event_call *call,
 	return event_create_dir(tr->event_dir, file, id, enable, filter, format);
 }
 
+/*
+ * Just create a decriptor for early init. A descriptor is required
+ * for enabling events at boot. We want to enable events before
+ * the filesystem is initialized.
+ */
+static __init int
+__trace_early_add_new_event(struct ftrace_event_call *call,
+			    struct trace_array *tr)
+{
+	struct ftrace_event_file *file;
+
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	file->event_call = call;
+	file->tr = tr;
+	list_add(&file->list, &tr->events);
+
+	return 0;
+}
+
 struct ftrace_module_file_ops;
 static void __add_event_to_tracers(struct ftrace_event_call *call,
 				   struct ftrace_module_file_ops *file_ops);
@@ -1709,6 +1731,56 @@ __trace_add_event_dirs(struct trace_array *tr)
 	}
 }
 
+/*
+ * The top level array has already had its ftrace_event_file
+ * descriptors created in order to allow for early events to
+ * be recorded. This function is called after the debugfs has been
+ * initialized, and we now have to create the files associated
+ * to the events.
+ */
+static __init void
+__trace_early_add_event_dirs(struct trace_array *tr)
+{
+	struct ftrace_event_file *file;
+	int ret;
+
+
+	list_for_each_entry(file, &tr->events, list) {
+		ret = event_create_dir(tr->event_dir, file,
+				       &ftrace_event_id_fops,
+				       &ftrace_enable_fops,
+				       &ftrace_event_filter_fops,
+				       &ftrace_event_format_fops);
+		if (ret < 0)
+			pr_warning("Could not create directory for event %s\n",
+				   file->event_call->name);
+	}
+}
+
+/*
+ * For early boot up, the top trace array requires to have
+ * a list of events that can be enabled. This must be done before
+ * the filesystem is set up in order to allow events to be traced
+ * early.
+ */
+static __init void
+__trace_early_add_events(struct trace_array *tr)
+{
+	struct ftrace_event_call *call;
+	int ret;
+
+	list_for_each_entry(call, &ftrace_events, list) {
+		/* Early boot up should not have any modules loaded */
+		if (WARN_ON_ONCE(call->mod))
+			continue;
+
+		ret = __trace_early_add_new_event(call, tr);
+		if (ret < 0)
+			pr_warning("Could not create early event %s\n",
+				   call->name);
+	}
+}
+
 /* Remove the event directory structure for a trace directory. */
 static void
 __trace_remove_event_dirs(struct trace_array *tr)
@@ -1763,25 +1835,23 @@ static __init int setup_trace_event(char *str)
 }
 __setup("trace_event=", setup_trace_event);
 
-int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
+/* Expects to have event_mutex held when called */
+static int
+create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 {
 	struct dentry *d_events;
 	struct dentry *entry;
 
-	mutex_lock(&event_mutex);
-
 	entry = debugfs_create_file("set_event", 0644, parent,
 				    tr, &ftrace_set_event_fops);
 	if (!entry) {
 		pr_warning("Could not create debugfs 'set_event' entry\n");
-		mutex_unlock(&event_mutex);
 		return -ENOMEM;
 	}
 
 	d_events = debugfs_create_dir("events", parent);
 	if (!d_events) {
 		pr_warning("Could not create debugfs 'events' directory\n");
-		mutex_unlock(&event_mutex);
 		return -ENOMEM;
 	}
 
@@ -1798,13 +1868,64 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 			  tr, &ftrace_tr_enable_fops);
 
 	tr->event_dir = d_events;
+
+	return 0;
+}
+
+/**
+ * event_trace_add_tracer - add a instance of a trace_array to events
+ * @parent: The parent dentry to place the files/directories for events in
+ * @tr: The trace array associated with these events
+ *
+ * When a new instance is created, it needs to set up its events
+ * directory, as well as other files associated with events. It also
+ * creates the event hierachry in the @parent/events directory.
+ *
+ * Returns 0 on success.
+ */
+int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+	int ret;
+
+	mutex_lock(&event_mutex);
+
+	ret = create_event_toplevel_files(parent, tr);
+	if (ret)
+		goto out_unlock;
+
 	down_write(&trace_event_mutex);
 	__trace_add_event_dirs(tr);
 	up_write(&trace_event_mutex);
 
+ out_unlock:
 	mutex_unlock(&event_mutex);
 
-	return 0;
+	return ret;
+}
+
+/*
+ * The top trace array already had its file descriptors created.
+ * Now the files themselves need to be created.
+ */
+static __init int
+early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+	int ret;
+
+	mutex_lock(&event_mutex);
+
+	ret = create_event_toplevel_files(parent, tr);
+	if (ret)
+		goto out_unlock;
+
+	down_write(&trace_event_mutex);
+	__trace_early_add_event_dirs(tr);
+	up_write(&trace_event_mutex);
+
+ out_unlock:
+	mutex_unlock(&event_mutex);
+
+	return ret;
 }
 
 int event_trace_del_tracer(struct trace_array *tr)
@@ -1842,6 +1963,14 @@ static __init int event_trace_enable(void)
 			list_add(&call->list, &ftrace_events);
 	}
 
+	/*
+	 * We need the top trace array to have a working set of trace
+	 * points at early init, before the debug files and directories
+	 * are created. Create the file entries now, and attach them
+	 * to the actual file dentries later.
+	 */
+	__trace_early_add_events(tr);
+
 	while (true) {
 		token = strsep(&buf, ",");
 
@@ -1882,7 +2011,7 @@ static __init int event_trace_init(void)
 	if (trace_define_common_fields())
 		pr_warning("tracing: Failed to allocate common fields");
 
-	ret = event_trace_add_tracer(d_tracer, tr);
+	ret = early_event_add_tracer(d_tracer, tr);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From d1a291437f75f6c841819b7855d95a21958cc822 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 27 Feb 2013 20:23:57 -0500
Subject: tracing: Use kmem_cache_alloc instead of kmalloc in trace_events.c

The event structures used by the trace events are mostly persistent,
but they are also allocated by kmalloc, which is not the best at
allocating space for what is used. By converting these kmallocs
into kmem_cache_allocs, we can save over 50K of space that is
permanently allocated.

After boot we have:

 slab name          active allocated size
 ---------          ------ --------- ----
ftrace_event_file    979   1005     56   67    1
ftrace_event_field   2301   2310     48   77    1

The ftrace_event_file has at boot up 979 active objects out of
1005 allocated in the slabs. Each object is 56 bytes. In a normal
kmalloc, that would allocate 64 bytes for each object.

 1005 - 979  = 26 objects not used
 26 * 56 = 1456 bytes wasted

But if we used kmalloc:

 64 - 56 = 8 bytes unused per allocation
 8 * 979 = 7832 bytes wasted

 7832 - 1456 = 6376 bytes in savings

Doing the same for ftrace_event_field where there's 2301 objects
allocated in a slab that can hold 2310 with 48 bytes each we have:

 2310 - 2301 = 9 objects not used
 9 * 48 = 432 bytes wasted

A kmalloc would also use 64 bytes per object:

 64 - 48 = 16 bytes unused per allocation
 16 * 2301 = 36816 bytes wasted!

 36816 - 432 = 36384 bytes in savings

This change gives us a total of 42760 bytes in savings. At least
on my machine, but as there's a lot of these persistent objects
for all configurations that use trace points, this is a net win.

Thanks to Ezequiel Garcia for his trace_analyze presentation which
pointed out the wasted space in my code.

Cc: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 21fe83b4106a..5d8845d36fa8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -36,6 +36,11 @@ EXPORT_SYMBOL_GPL(event_storage);
 LIST_HEAD(ftrace_events);
 LIST_HEAD(ftrace_common_fields);
 
+#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
+
+static struct kmem_cache *field_cachep;
+static struct kmem_cache *file_cachep;
+
 /* Double loops, do not use break, only goto's work */
 #define do_for_each_event_file(tr, file)			\
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\
@@ -63,7 +68,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
 {
 	struct ftrace_event_field *field;
 
-	field = kzalloc(sizeof(*field), GFP_KERNEL);
+	field = kmem_cache_alloc(field_cachep, GFP_TRACE);
 	if (!field)
 		goto err;
 
@@ -91,7 +96,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
 err:
 	if (field)
 		kfree(field->name);
-	kfree(field);
+	kmem_cache_free(field_cachep, field);
 
 	return -ENOMEM;
 }
@@ -143,7 +148,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
 		list_del(&field->link);
 		kfree(field->type);
 		kfree(field->name);
-		kfree(field);
+		kmem_cache_free(field_cachep, field);
 	}
 }
 
@@ -1383,7 +1388,7 @@ static void remove_event_from_tracers(struct ftrace_event_call *call)
 		list_del(&file->list);
 		debugfs_remove_recursive(file->dir);
 		remove_subsystem(file->system);
-		kfree(file);
+		kmem_cache_free(file_cachep, file);
 
 		/*
 		 * The do_for_each_event_file_safe() is
@@ -1462,7 +1467,7 @@ __trace_add_new_event(struct ftrace_event_call *call,
 {
 	struct ftrace_event_file *file;
 
-	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
 	if (!file)
 		return -ENOMEM;
 
@@ -1484,7 +1489,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
 {
 	struct ftrace_event_file *file;
 
-	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
 	if (!file)
 		return -ENOMEM;
 
@@ -1791,7 +1796,7 @@ __trace_remove_event_dirs(struct trace_array *tr)
 		list_del(&file->list);
 		debugfs_remove_recursive(file->dir);
 		remove_subsystem(file->system);
-		kfree(file);
+		kmem_cache_free(file_cachep, file);
 	}
 }
 
@@ -1947,6 +1952,13 @@ int event_trace_del_tracer(struct trace_array *tr)
 	return 0;
 }
 
+static __init int event_trace_memsetup(void)
+{
+	field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
+	file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+	return 0;
+}
+
 static __init int event_trace_enable(void)
 {
 	struct trace_array *tr = top_trace_array();
@@ -2021,6 +2033,7 @@ static __init int event_trace_init(void)
 
 	return 0;
 }
+early_initcall(event_trace_memsetup);
 core_initcall(event_trace_enable);
 fs_initcall(event_trace_init);
 
-- 
cgit v1.2.3


From 92edca073c374f66b8eee20ec6426fb0cdb6c4d5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 27 Feb 2013 20:41:37 -0500
Subject: tracing: Use direct field, type and system names

The names used to display the field and type in the event format
files are copied, as well as the system name that is displayed.

All these names are created by constant values passed in.
If one of theses values were to be removed by a module, the module
would also be required to remove any event it created.

By using the strings directly, we can save over 100K of memory.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        |  4 ++--
 kernel/trace/trace_events.c | 20 +++-----------------
 2 files changed, 5 insertions(+), 19 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b825ea2d8c64..e420f2a230de 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -887,8 +887,8 @@ enum {
 
 struct ftrace_event_field {
 	struct list_head	link;
-	char			*name;
-	char			*type;
+	const char		*name;
+	const char		*type;
 	int			filter_type;
 	int			offset;
 	int			size;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5d8845d36fa8..63b4bdf84593 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -72,13 +72,8 @@ static int __trace_define_field(struct list_head *head, const char *type,
 	if (!field)
 		goto err;
 
-	field->name = kstrdup(name, GFP_KERNEL);
-	if (!field->name)
-		goto err;
-
-	field->type = kstrdup(type, GFP_KERNEL);
-	if (!field->type)
-		goto err;
+	field->name = name;
+	field->type = type;
 
 	if (filter_type == FILTER_OTHER)
 		field->filter_type = filter_assign_type(type);
@@ -94,8 +89,6 @@ static int __trace_define_field(struct list_head *head, const char *type,
 	return 0;
 
 err:
-	if (field)
-		kfree(field->name);
 	kmem_cache_free(field_cachep, field);
 
 	return -ENOMEM;
@@ -146,8 +139,6 @@ void trace_destroy_fields(struct ftrace_event_call *call)
 	head = trace_get_fields(call);
 	list_for_each_entry_safe(field, next, head, link) {
 		list_del(&field->link);
-		kfree(field->type);
-		kfree(field->name);
 		kmem_cache_free(field_cachep, field);
 	}
 }
@@ -286,7 +277,6 @@ static void __put_system(struct event_subsystem *system)
 		kfree(filter->filter_string);
 		kfree(filter);
 	}
-	kfree(system->name);
 	kfree(system);
 }
 
@@ -1202,10 +1192,7 @@ create_new_subsystem(const char *name)
 		return NULL;
 
 	system->ref_count = 1;
-	system->name = kstrdup(name, GFP_KERNEL);
-
-	if (!system->name)
-		goto out_free;
+	system->name = name;
 
 	system->filter = NULL;
 
@@ -1218,7 +1205,6 @@ create_new_subsystem(const char *name)
 	return system;
 
  out_free:
-	kfree(system->name);
 	kfree(system);
 	return NULL;
 }
-- 
cgit v1.2.3


From 189e5784f6c5e001a84127b83f03bc76a8bfb1ec Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 28 Feb 2013 20:03:06 -0500
Subject: tracing: Do not block on splice if either file or splice NONBLOCK
 flag is set

Currently only the splice NONBLOCK flag is checked to determine if
the splice read should block or not. But the file descriptor NONBLOCK
flag also needs to be checked.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ab3df804fa96..598a7aa7d0ae 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4593,7 +4593,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 
 	/* did we read anything? */
 	if (!spd.nr_pages) {
-		if (flags & SPLICE_F_NONBLOCK)
+		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
 			ret = -EAGAIN;
 		else
 			ret = 0;
-- 
cgit v1.2.3


From cc60cdc952be09bca5b0bff9fefc7aa6185c3049 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 Feb 2013 09:17:16 -0500
Subject: tracing: Fix polling on trace_pipe_raw

The trace_pipe_raw never implemented polling and this was casing
issues for several utilities. This is now implemented.

Blocked reads still are on the TODO list.

Reported-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Tested-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 78 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 51 insertions(+), 27 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 598a7aa7d0ae..4a6e461273a9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3555,10 +3555,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 }
 
 static unsigned int
-tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
 {
-	struct trace_iterator *iter = filp->private_data;
-
 	if (trace_flags & TRACE_ITER_BLOCK) {
 		/*
 		 * Always select as readable when in blocking mode
@@ -3567,6 +3565,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 	} else {
 		if (!trace_empty(iter))
 			return POLLIN | POLLRDNORM;
+		trace_wakeup_needed = true;
 		poll_wait(filp, &trace_wait, poll_table);
 		if (!trace_empty(iter))
 			return POLLIN | POLLRDNORM;
@@ -3575,6 +3574,14 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 	}
 }
 
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+	struct trace_iterator *iter = filp->private_data;
+
+	return trace_poll(iter, filp, poll_table);
+}
+
 /*
  * This is a make-shift waitqueue.
  * A tracer might use this callback on some rare cases:
@@ -4362,9 +4369,8 @@ static const struct file_operations snapshot_fops = {
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
 struct ftrace_buffer_info {
-	struct trace_array	*tr;
+	struct trace_iterator	iter;
 	void			*spare;
-	int			cpu;
 	unsigned int		read;
 };
 
@@ -4381,22 +4387,32 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 	if (!info)
 		return -ENOMEM;
 
-	info->tr	= tr;
-	info->cpu	= tc->cpu;
-	info->spare	= NULL;
+	info->iter.tr		= tr;
+	info->iter.cpu_file	= tc->cpu;
+	info->spare		= NULL;
 	/* Force reading ring buffer for first read */
-	info->read	= (unsigned int)-1;
+	info->read		= (unsigned int)-1;
 
 	filp->private_data = info;
 
 	return nonseekable_open(inode, filp);
 }
 
+static unsigned int
+tracing_buffers_poll(struct file *filp, poll_table *poll_table)
+{
+	struct ftrace_buffer_info *info = filp->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	return trace_poll(iter, filp, poll_table);
+}
+
 static ssize_t
 tracing_buffers_read(struct file *filp, char __user *ubuf,
 		     size_t count, loff_t *ppos)
 {
 	struct ftrace_buffer_info *info = filp->private_data;
+	struct trace_iterator *iter = &info->iter;
 	ssize_t ret;
 	size_t size;
 
@@ -4404,7 +4420,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return 0;
 
 	if (!info->spare)
-		info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
+		info->spare = ring_buffer_alloc_read_page(iter->tr->buffer, iter->cpu_file);
 	if (!info->spare)
 		return -ENOMEM;
 
@@ -4412,12 +4428,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (info->read < PAGE_SIZE)
 		goto read;
 
-	trace_access_lock(info->cpu);
-	ret = ring_buffer_read_page(info->tr->buffer,
+	trace_access_lock(iter->cpu_file);
+	ret = ring_buffer_read_page(iter->tr->buffer,
 				    &info->spare,
 				    count,
-				    info->cpu, 0);
-	trace_access_unlock(info->cpu);
+				    iter->cpu_file, 0);
+	trace_access_unlock(iter->cpu_file);
 	if (ret < 0)
 		return 0;
 
@@ -4442,9 +4458,10 @@ read:
 static int tracing_buffers_release(struct inode *inode, struct file *file)
 {
 	struct ftrace_buffer_info *info = file->private_data;
+	struct trace_iterator *iter = &info->iter;
 
 	if (info->spare)
-		ring_buffer_free_read_page(info->tr->buffer, info->spare);
+		ring_buffer_free_read_page(iter->tr->buffer, info->spare);
 	kfree(info);
 
 	return 0;
@@ -4511,6 +4528,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			    unsigned int flags)
 {
 	struct ftrace_buffer_info *info = file->private_data;
+	struct trace_iterator *iter = &info->iter;
 	struct partial_page partial_def[PIPE_DEF_BUFFERS];
 	struct page *pages_def[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
@@ -4541,8 +4559,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 	}
 
-	trace_access_lock(info->cpu);
-	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+ again:
+	trace_access_lock(iter->cpu_file);
+	entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file);
 
 	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
@@ -4553,15 +4572,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			break;
 
 		ref->ref = 1;
-		ref->buffer = info->tr->buffer;
-		ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
+		ref->buffer = iter->tr->buffer;
+		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
 		if (!ref->page) {
 			kfree(ref);
 			break;
 		}
 
 		r = ring_buffer_read_page(ref->buffer, &ref->page,
-					  len, info->cpu, 1);
+					  len, iter->cpu_file, 1);
 		if (r < 0) {
 			ring_buffer_free_read_page(ref->buffer, ref->page);
 			kfree(ref);
@@ -4585,20 +4604,24 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.nr_pages++;
 		*ppos += PAGE_SIZE;
 
-		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+		entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file);
 	}
 
-	trace_access_unlock(info->cpu);
+	trace_access_unlock(iter->cpu_file);
 	spd.nr_pages = i;
 
 	/* did we read anything? */
 	if (!spd.nr_pages) {
-		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
+		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
 			ret = -EAGAIN;
-		else
-			ret = 0;
-		/* TODO: block */
-		goto out;
+			goto out;
+		}
+		default_wait_pipe(iter);
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			goto out;
+		}
+		goto again;
 	}
 
 	ret = splice_to_pipe(pipe, &spd);
@@ -4610,6 +4633,7 @@ out:
 static const struct file_operations tracing_buffers_fops = {
 	.open		= tracing_buffers_open,
 	.read		= tracing_buffers_read,
+	.poll		= tracing_buffers_poll,
 	.release	= tracing_buffers_release,
 	.splice_read	= tracing_buffers_splice_read,
 	.llseek		= no_llseek,
-- 
cgit v1.2.3


From b627344fef0c38fa4e3050348e168e46db87c905 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 Feb 2013 13:44:11 -0500
Subject: tracing: Fix read blocking on trace_pipe_raw

If the ring buffer is empty, a read to trace_pipe_raw wont block.
The tracing code has the infrastructure to wake up waiting readers,
but the trace_pipe_raw doesn't take advantage of that.

When a read is done to trace_pipe_raw without the O_NONBLOCK flag
set, have the read block until there's data in the requested buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4a6e461273a9..3ec146c96df4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4389,6 +4389,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 
 	info->iter.tr		= tr;
 	info->iter.cpu_file	= tc->cpu;
+	info->iter.trace	= tr->current_trace;
 	info->spare		= NULL;
 	/* Force reading ring buffer for first read */
 	info->read		= (unsigned int)-1;
@@ -4428,18 +4429,29 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (info->read < PAGE_SIZE)
 		goto read;
 
+ again:
 	trace_access_lock(iter->cpu_file);
 	ret = ring_buffer_read_page(iter->tr->buffer,
 				    &info->spare,
 				    count,
 				    iter->cpu_file, 0);
 	trace_access_unlock(iter->cpu_file);
-	if (ret < 0)
+
+	if (ret < 0) {
+		if (trace_empty(iter)) {
+			if ((filp->f_flags & O_NONBLOCK))
+				return -EAGAIN;
+			iter->trace->wait_pipe(iter);
+			if (signal_pending(current))
+				return -EINTR;
+			goto again;
+		}
 		return 0;
+	}
 
 	info->read = 0;
 
-read:
+ read:
 	size = PAGE_SIZE - info->read;
 	if (size > count)
 		size = count;
@@ -4616,7 +4628,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			ret = -EAGAIN;
 			goto out;
 		}
-		default_wait_pipe(iter);
+		iter->trace->wait_pipe(iter);
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
-- 
cgit v1.2.3


From 15693458c4bc0693fd63a50d60f35b628fcf4e29 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 28 Feb 2013 19:59:17 -0500
Subject: tracing/ring-buffer: Move poll wake ups into ring buffer code

Move the logic to wake up on ring buffer data into the ring buffer
code itself. This simplifies the tracing code a lot and also has the
added benefit that waiters on one of the instance buffers can be woken
only when data is added to that instance instead of data added to
any instance.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 146 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c       |  83 ++++----------------------
 2 files changed, 158 insertions(+), 71 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7244acde77b0..56b6ea32d2e7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,6 +8,7 @@
 #include <linux/trace_clock.h>
 #include <linux/trace_seq.h>
 #include <linux/spinlock.h>
+#include <linux/irq_work.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
@@ -442,6 +443,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 	return ret;
 }
 
+struct rb_irq_work {
+	struct irq_work			work;
+	wait_queue_head_t		waiters;
+	bool				waiters_pending;
+};
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
@@ -476,6 +483,8 @@ struct ring_buffer_per_cpu {
 	struct list_head		new_pages; /* new pages to add */
 	struct work_struct		update_pages_work;
 	struct completion		update_done;
+
+	struct rb_irq_work		irq_work;
 };
 
 struct ring_buffer {
@@ -495,6 +504,8 @@ struct ring_buffer {
 	struct notifier_block		cpu_notify;
 #endif
 	u64				(*clock)(void);
+
+	struct rb_irq_work		irq_work;
 };
 
 struct ring_buffer_iter {
@@ -506,6 +517,118 @@ struct ring_buffer_iter {
 	u64				read_stamp;
 };
 
+/*
+ * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
+ *
+ * Schedules a delayed work to wake up any task that is blocked on the
+ * ring buffer waiters queue.
+ */
+static void rb_wake_up_waiters(struct irq_work *work)
+{
+	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+
+	wake_up_all(&rbwork->waiters);
+}
+
+/**
+ * ring_buffer_wait - wait for input to the ring buffer
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ */
+void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	DEFINE_WAIT(wait);
+	struct rb_irq_work *work;
+
+	/*
+	 * Depending on what the caller is waiting for, either any
+	 * data in any cpu buffer, or a specific buffer, put the
+	 * caller on the appropriate wait queue.
+	 */
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		work = &buffer->irq_work;
+	else {
+		cpu_buffer = buffer->buffers[cpu];
+		work = &cpu_buffer->irq_work;
+	}
+
+
+	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+
+	/*
+	 * The events can happen in critical sections where
+	 * checking a work queue can cause deadlocks.
+	 * After adding a task to the queue, this flag is set
+	 * only to notify events to try to wake up the queue
+	 * using irq_work.
+	 *
+	 * We don't clear it even if the buffer is no longer
+	 * empty. The flag only causes the next event to run
+	 * irq_work to do the work queue wake up. The worse
+	 * that can happen if we race with !trace_empty() is that
+	 * an event will cause an irq_work to try to wake up
+	 * an empty queue.
+	 *
+	 * There's no reason to protect this flag either, as
+	 * the work queue and irq_work logic will do the necessary
+	 * synchronization for the wake ups. The only thing
+	 * that is necessary is that the wake up happens after
+	 * a task has been queued. It's OK for spurious wake ups.
+	 */
+	work->waiters_pending = true;
+
+	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
+	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
+		schedule();
+
+	finish_wait(&work->waiters, &wait);
+}
+
+/**
+ * ring_buffer_poll_wait - poll on buffer input
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @filp: the file descriptor
+ * @poll_table: The poll descriptor
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ *
+ * Returns POLLIN | POLLRDNORM if data exists in the buffers,
+ * zero otherwise.
+ */
+int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+			  struct file *filp, poll_table *poll_table)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct rb_irq_work *work;
+
+	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+		return POLLIN | POLLRDNORM;
+
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		work = &buffer->irq_work;
+	else {
+		cpu_buffer = buffer->buffers[cpu];
+		work = &cpu_buffer->irq_work;
+	}
+
+	work->waiters_pending = true;
+	poll_wait(filp, &work->waiters, poll_table);
+
+	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
 /* buffer may be either ring_buffer or ring_buffer_per_cpu */
 #define RB_WARN_ON(b, cond)						\
 	({								\
@@ -1061,6 +1184,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
 	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 	INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
 	init_completion(&cpu_buffer->update_done);
+	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
 
 	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
@@ -1156,6 +1280,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 	buffer->clock = trace_clock_local;
 	buffer->reader_lock_key = key;
 
+	init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
+
 	/* need at least two pages */
 	if (nr_pages < 2)
 		nr_pages = 2;
@@ -2610,6 +2736,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 	rb_end_commit(cpu_buffer);
 }
 
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+	if (buffer->irq_work.waiters_pending) {
+		buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies it's own memory barriers */
+		irq_work_queue(&buffer->irq_work.work);
+	}
+
+	if (cpu_buffer->irq_work.waiters_pending) {
+		cpu_buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies it's own memory barriers */
+		irq_work_queue(&cpu_buffer->irq_work.work);
+	}
+}
+
 /**
  * ring_buffer_unlock_commit - commit a reserved
  * @buffer: The buffer to commit to
@@ -2629,6 +2771,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	rb_wakeups(buffer, cpu_buffer);
+
 	trace_recursive_unlock();
 
 	preempt_enable_notrace();
@@ -2801,6 +2945,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	rb_wakeups(buffer, cpu_buffer);
+
 	ret = 0;
  out:
 	preempt_enable_notrace();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3ec146c96df4..b5b25b6575a9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -19,7 +19,6 @@
 #include <linux/seq_file.h>
 #include <linux/notifier.h>
 #include <linux/irqflags.h>
-#include <linux/irq_work.h>
 #include <linux/debugfs.h>
 #include <linux/pagemap.h>
 #include <linux/hardirq.h>
@@ -86,14 +85,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
  */
 static DEFINE_PER_CPU(bool, trace_cmdline_save);
 
-/*
- * When a reader is waiting for data, then this variable is
- * set to true.
- */
-static bool trace_wakeup_needed;
-
-static struct irq_work trace_work_wakeup;
-
 /*
  * Kill all tracing for good (never come back).
  * It is initialized to 1 but will turn to zero if the initialization
@@ -334,28 +325,12 @@ static inline void trace_access_lock_init(void)
 
 #endif
 
-/* trace_wait is a waitqueue for tasks blocked on trace_poll */
-static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
-
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
 	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
 	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
 
-/**
- * trace_wake_up - wake up tasks waiting for trace input
- *
- * Schedules a delayed work to wake up any task that is blocked on the
- * trace_wait queue. These is used with trace_poll for tasks polling the
- * trace.
- */
-static void trace_wake_up(struct irq_work *work)
-{
-	wake_up_all(&trace_wait);
-
-}
-
 /**
  * tracing_on - enable tracing buffers
  *
@@ -763,36 +738,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 static void default_wait_pipe(struct trace_iterator *iter)
 {
-	DEFINE_WAIT(wait);
-
-	prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
-
-	/*
-	 * The events can happen in critical sections where
-	 * checking a work queue can cause deadlocks.
-	 * After adding a task to the queue, this flag is set
-	 * only to notify events to try to wake up the queue
-	 * using irq_work.
-	 *
-	 * We don't clear it even if the buffer is no longer
-	 * empty. The flag only causes the next event to run
-	 * irq_work to do the work queue wake up. The worse
-	 * that can happen if we race with !trace_empty() is that
-	 * an event will cause an irq_work to try to wake up
-	 * an empty queue.
-	 *
-	 * There's no reason to protect this flag either, as
-	 * the work queue and irq_work logic will do the necessary
-	 * synchronization for the wake ups. The only thing
-	 * that is necessary is that the wake up happens after
-	 * a task has been queued. It's OK for spurious wake ups.
-	 */
-	trace_wakeup_needed = true;
-
-	if (trace_empty(iter))
-		schedule();
+	/* Iterators are static, they should be filled or empty */
+	if (trace_buffer_iter(iter, iter->cpu_file))
+		return;
 
-	finish_wait(&trace_wait, &wait);
+	ring_buffer_wait(iter->tr->buffer, iter->cpu_file);
 }
 
 /**
@@ -1262,11 +1212,6 @@ void
 __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
 {
 	__this_cpu_write(trace_cmdline_save, true);
-	if (trace_wakeup_needed) {
-		trace_wakeup_needed = false;
-		/* irq_work_queue() supplies it's own memory barriers */
-		irq_work_queue(&trace_work_wakeup);
-	}
 	ring_buffer_unlock_commit(buffer, event);
 }
 
@@ -3557,21 +3502,18 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 static unsigned int
 trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
 {
-	if (trace_flags & TRACE_ITER_BLOCK) {
+	/* Iterators are static, they should be filled or empty */
+	if (trace_buffer_iter(iter, iter->cpu_file))
+		return POLLIN | POLLRDNORM;
+
+	if (trace_flags & TRACE_ITER_BLOCK)
 		/*
 		 * Always select as readable when in blocking mode
 		 */
 		return POLLIN | POLLRDNORM;
-	} else {
-		if (!trace_empty(iter))
-			return POLLIN | POLLRDNORM;
-		trace_wakeup_needed = true;
-		poll_wait(filp, &trace_wait, poll_table);
-		if (!trace_empty(iter))
-			return POLLIN | POLLRDNORM;
-
-		return 0;
-	}
+	else
+		return ring_buffer_poll_wait(iter->tr->buffer, iter->cpu_file,
+					     filp, poll_table);
 }
 
 static unsigned int
@@ -5701,7 +5643,6 @@ __init static int tracer_alloc_buffers(void)
 #endif
 
 	trace_init_cmdlines();
-	init_irq_work(&trace_work_wakeup, trace_wake_up);
 
 	register_tracer(&nop_trace);
 
-- 
cgit v1.2.3


From f71130de5c7fba92faf3901784714e37a234c08f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Thu, 21 Feb 2013 10:32:38 +0800
Subject: tracing: Add a helper function for event print functions

Move duplicate code in event print functions to a helper function.

This shrinks the size of the kernel by ~13K.

   text    data     bss     dec     hex filename
6596137 1743966 10138672        18478775        119f6b7 vmlinux.o.old
6583002 1743849 10138672        18465523        119c2f3 vmlinux.o.new

Link: http://lkml.kernel.org/r/51258746.2060304@huawei.com

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 194d79602dc7..aa92ac322ba2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -397,6 +397,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 }
 EXPORT_SYMBOL(ftrace_print_hex_seq);
 
+int ftrace_raw_output_prep(struct trace_iterator *iter,
+			   struct trace_event *trace_event)
+{
+	struct ftrace_event_call *event;
+	struct trace_seq *s = &iter->seq;
+	struct trace_seq *p = &iter->tmp_seq;
+	struct trace_entry *entry;
+	int ret;
+
+	event = container_of(trace_event, struct ftrace_event_call, event);
+	entry = iter->ent;
+
+	if (entry->type != event->event.type) {
+		WARN_ON_ONCE(1);
+		return TRACE_TYPE_UNHANDLED;
+	}
+
+	trace_seq_init(p);
+	ret = trace_seq_printf(s, "%s: ", event->name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+EXPORT_SYMBOL(ftrace_raw_output_prep);
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
-- 
cgit v1.2.3


From 7e4f44b153e1ec07bb64c1c1671cdf492465bbf3 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Thu, 21 Feb 2013 10:33:33 +0800
Subject: tracing: Annotate event field-defining functions with __init

Those functions are called either during kernel boot or module init.

Before:

$ dmesg | grep 'Freeing unused kernel memory'
Freeing unused kernel memory: 1208k freed
Freeing unused kernel memory: 1360k freed
Freeing unused kernel memory: 1960k freed

After:

$ dmesg | grep 'Freeing unused kernel memory'
Freeing unused kernel memory: 1236k freed
Freeing unused kernel memory: 1388k freed
Freeing unused kernel memory: 1960k freed

Link: http://lkml.kernel.org/r/5125877D.5000201@huawei.com

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_export.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037d..4f6a91c1370c 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void)		\
 
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\
-int									\
+static int __init							\
 ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 {									\
 	struct struct_name field;					\
-- 
cgit v1.2.3


From b8aae39fc54a2e297698288ac48237cc4c6f83bb Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Thu, 21 Feb 2013 10:33:58 +0800
Subject: tracing/syscalls: Annotate field-defining functions with __init

These two functions are called during kernel boot only.

Link: http://lkml.kernel.org/r/51258796.7020704@huawei.com

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a842783ad6be..00b5c3e6fbbe 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -261,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
 		kfree(call->print_fmt);
 }
 
-static int syscall_enter_define_fields(struct ftrace_event_call *call)
+static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
 {
 	struct syscall_trace_enter trace;
 	struct syscall_metadata *meta = call->data;
@@ -284,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
 	return ret;
 }
 
-static int syscall_exit_define_fields(struct ftrace_event_call *call)
+static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
 {
 	struct syscall_trace_exit trace;
 	int ret;
-- 
cgit v1.2.3


From 34ef61b1fa6172e994e441f1f0241dc53a75bd5f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Sat, 2 Mar 2013 16:49:10 -0500
Subject: tracing: Add __per_cpu annotation to trace array percpu data pointer

With the conversion of the data array to per cpu, sparse now complains
about the use of per_cpu_ptr() on the variable. But The variable is
allocated with alloc_percpu() and is fine to use. But since the structure
that contains the data variable does not annotate it as such, sparse
gives out a lot of false warnings.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e420f2a230de..6728a249e817 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -198,7 +198,7 @@ struct trace_array {
 	struct list_head	systems;
 	struct list_head	events;
 	struct task_struct	*waiter;
-	struct trace_array_cpu	*data;
+	struct trace_array_cpu __percpu	*data;
 };
 
 enum {
-- 
cgit v1.2.3


From 315326c16ad08771fe0f075a08a18c99976f29f5 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Sat, 2 Mar 2013 17:37:14 -0500
Subject: tracing: Fix trace events build without modules

The new multi-buffers added a descriptor that kept track of module
events, and the directories they use, with struct ftace_module_file_ops.
This is used to add a ref count to keep modules from unloading while
their files are being accessed.

As the descriptor is only needed when CONFIG_MODULES is enabled, it
is only declared when the config is enabled. But that struct is
dereferenced in a few areas outside the #ifdef CONFIG_MODULES.

By adding some helper routines and moving code around a little,
events can be compiled again without modules.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 55 +++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 19 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 63b4bdf84593..0f1307a29fcf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1546,9 +1546,18 @@ struct ftrace_module_file_ops {
 	struct file_operations		filter;
 };
 
-static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod)
+static struct ftrace_module_file_ops *
+find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
 {
-	struct ftrace_module_file_ops *file_ops;
+	/*
+	 * As event_calls are added in groups by module,
+	 * when we find one file_ops, we don't need to search for
+	 * each call in that module, as the rest should be the
+	 * same. Only search for a new one if the last one did
+	 * not match.
+	 */
+	if (file_ops && mod == file_ops->mod)
+		return file_ops;
 
 	list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
 		if (file_ops->mod == mod)
@@ -1664,16 +1673,35 @@ static int trace_module_notify(struct notifier_block *self,
 
 	return 0;
 }
+
+static int
+__trace_add_new_mod_event(struct ftrace_event_call *call,
+			  struct trace_array *tr,
+			  struct ftrace_module_file_ops *file_ops)
+{
+	return __trace_add_new_event(call, tr,
+				     &file_ops->id, &file_ops->enable,
+				     &file_ops->filter, &file_ops->format);
+}
+
 #else
-static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod)
+static inline struct ftrace_module_file_ops *
+find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
 {
 	return NULL;
 }
-static int trace_module_notify(struct notifier_block *self,
-			       unsigned long val, void *data)
+static inline int trace_module_notify(struct notifier_block *self,
+				      unsigned long val, void *data)
 {
 	return 0;
 }
+static inline int
+__trace_add_new_mod_event(struct ftrace_event_call *call,
+			  struct trace_array *tr,
+			  struct ftrace_module_file_ops *file_ops)
+{
+	return -ENODEV;
+}
 #endif /* CONFIG_MODULES */
 
 /* Create a new event directory structure for a trace directory. */
@@ -1692,20 +1720,11 @@ __trace_add_event_dirs(struct trace_array *tr)
 			 * want the module to disappear when reading one
 			 * of these files). The file_ops keep account of
 			 * the module ref count.
-			 *
-			 * As event_calls are added in groups by module,
-			 * when we find one file_ops, we don't need to search for
-			 * each call in that module, as the rest should be the
-			 * same. Only search for a new one if the last one did
-			 * not match.
 			 */
-			if (!file_ops || call->mod != file_ops->mod)
-				file_ops = find_ftrace_file_ops(call->mod);
+			file_ops = find_ftrace_file_ops(file_ops, call->mod);
 			if (!file_ops)
 				continue; /* Warn? */
-			ret = __trace_add_new_event(call, tr,
-					&file_ops->id, &file_ops->enable,
-					&file_ops->filter, &file_ops->format);
+			ret = __trace_add_new_mod_event(call, tr, file_ops);
 			if (ret < 0)
 				pr_warning("Could not create directory for event %s\n",
 					   call->name);
@@ -1794,9 +1813,7 @@ __add_event_to_tracers(struct ftrace_event_call *call,
 
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		if (file_ops)
-			__trace_add_new_event(call, tr,
-					      &file_ops->id, &file_ops->enable,
-					      &file_ops->filter, &file_ops->format);
+			__trace_add_new_mod_event(call, tr, file_ops);
 		else
 			__trace_add_new_event(call, tr,
 					      &ftrace_event_id_fops,
-- 
cgit v1.2.3


From 523c81135bb23b2d9a8c21365d90d21b1309c138 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 4 Mar 2013 14:15:59 +0800
Subject: tracing: Fix some section mismatch warnings

As we've added __init annotation to field-defining functions, we should
add __refdata annotation to event_call variables, which reference those
functions.

Link: http://lkml.kernel.org/r/51343C1F.2050502@huawei.com

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_export.c   | 2 +-
 kernel/trace/trace_syscalls.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4f6a91c1370c..d21a74670088 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
 			 regfn)						\
 									\
-struct ftrace_event_class event_class_ftrace_##call = {			\
+struct ftrace_event_class __refdata event_class_ftrace_##call = {	\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.define_fields		= ftrace_define_fields_##call,		\
 	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 00b5c3e6fbbe..1cd37ffb4093 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -479,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
 	.trace		= print_syscall_exit,
 };
 
-struct ftrace_event_class event_class_syscall_enter = {
+struct ftrace_event_class __refdata event_class_syscall_enter = {
 	.system		= "syscalls",
 	.reg		= syscall_enter_register,
 	.define_fields	= syscall_enter_define_fields,
@@ -487,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {
 	.raw_init	= init_syscall_trace,
 };
 
-struct ftrace_event_class event_class_syscall_exit = {
+struct ftrace_event_class __refdata event_class_syscall_exit = {
 	.system		= "syscalls",
 	.reg		= syscall_exit_register,
 	.define_fields	= syscall_exit_define_fields,
-- 
cgit v1.2.3


From f1dc6725882b5ca54eb9a04436a3b47d58f2cbc7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Mon, 4 Mar 2013 17:33:05 -0500
Subject: ring-buffer: Init waitqueue for blocked readers

The move of blocked readers to the ring buffer left out the
init of the wait queue that is used. Tests missed this due to running
stress tests against the buffers, which didn't allow for any
readers to end up waiting. Running a simple read and wait triggered
a bug.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 56b6ea32d2e7..65fe2a4f9824 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1185,6 +1185,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
 	INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
 	init_completion(&cpu_buffer->update_done);
 	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
+	init_waitqueue_head(&cpu_buffer->irq_work.waiters);
 
 	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
@@ -1281,6 +1282,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 	buffer->reader_lock_key = key;
 
 	init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
+	init_waitqueue_head(&buffer->irq_work.waiters);
 
 	/* need at least two pages */
 	if (nr_pages < 2)
-- 
cgit v1.2.3


From 575380da8b46969a2c6a7e14a51056a63b30fe2e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Mon, 4 Mar 2013 23:05:12 -0500
Subject: tracing: Only clear trace buffer on module unload if event was traced

Currently, when a module with events is unloaded, the trace buffer is
cleared. This is just a safety net in case the module might have some
strange callback when its event is outputted. But there's no reason
to reset the buffer if the module didn't have any of its events traced.

Add a flag to the event "call" structure called WAS_ENABLED and gets set
when the event is ever enabled, and this flag never gets cleared. When a
module gets unloaded, if any of its events have this flag set, then the
trace buffer will get cleared.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0f1307a29fcf..9a7dc4bf1171 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -245,6 +245,9 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file,
 				break;
 			}
 			file->flags |= FTRACE_EVENT_FL_ENABLED;
+
+			/* WAS_ENABLED gets set but never cleared. */
+			call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
 		}
 		break;
 	}
@@ -1626,12 +1629,13 @@ static void trace_module_remove_events(struct module *mod)
 {
 	struct ftrace_module_file_ops *file_ops;
 	struct ftrace_event_call *call, *p;
-	bool found = false;
+	bool clear_trace = false;
 
 	down_write(&trace_event_mutex);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
-			found = true;
+			if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
+				clear_trace = true;
 			__trace_remove_event_call(call);
 		}
 	}
@@ -1648,9 +1652,9 @@ static void trace_module_remove_events(struct module *mod)
 
 	/*
 	 * It is safest to reset the ring buffer if the module being unloaded
-	 * registered any events.
+	 * registered any events that were used.
 	 */
-	if (found)
+	if (clear_trace)
 		tracing_reset_current_online_cpus();
 	up_write(&trace_event_mutex);
 }
-- 
cgit v1.2.3


From 873c642f5964b260480850040dec21e42d0ae4e4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Mon, 4 Mar 2013 23:26:06 -0500
Subject: tracing: Clear all trace buffers when unloaded module event was used

Currently we do not know what buffer a module event was enabled in.
On unload, it is safest to clear all buffer instances, not just the
top level buffer.

Todo: Clear only the buffer that the event was used in. The
infrastructure is there to do this, but it makes the code a bit
more complex. Lets get the current code vetted before we add that.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 10 ++++++++--
 kernel/trace/trace.h        |  2 +-
 kernel/trace/trace_events.c | 10 +++++++---
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b5b25b6575a9..c8a852a55db4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -912,9 +912,15 @@ void tracing_reset_current(int cpu)
 	tracing_reset(&global_trace, cpu);
 }
 
-void tracing_reset_current_online_cpus(void)
+void tracing_reset_all_online_cpus(void)
 {
-	tracing_reset_online_cpus(&global_trace);
+	struct trace_array *tr;
+
+	mutex_lock(&trace_types_lock);
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		tracing_reset_online_cpus(tr);
+	}
+	mutex_unlock(&trace_types_lock);
 }
 
 #define SAVED_CMDLINES 128
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6728a249e817..fa60b2977524 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -496,7 +496,7 @@ int tracing_is_enabled(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
 void tracing_reset_current(int cpu);
-void tracing_reset_current_online_cpus(void);
+void tracing_reset_all_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *trace_create_file(const char *name,
 				 umode_t mode,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9a7dc4bf1171..a376ab5eec5c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1649,14 +1649,18 @@ static void trace_module_remove_events(struct module *mod)
 		list_del(&file_ops->list);
 		kfree(file_ops);
 	}
+	up_write(&trace_event_mutex);
 
 	/*
 	 * It is safest to reset the ring buffer if the module being unloaded
-	 * registered any events that were used.
+	 * registered any events that were used. The only worry is if
+	 * a new module gets loaded, and takes on the same id as the events
+	 * of this module. When printing out the buffer, traced events left
+	 * over from this module may be passed to the new module events and
+	 * unexpected results may occur.
 	 */
 	if (clear_trace)
-		tracing_reset_current_online_cpus();
-	up_write(&trace_event_mutex);
+		tracing_reset_all_online_cpus();
 }
 
 static int trace_module_notify(struct notifier_block *self,
-- 
cgit v1.2.3


From 22cffc2bb4a50d8c56f03c56f9f19dea85b78e30 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 07:30:24 -0500
Subject: tracing: Enable snapshot when any latency tracer is enabled

The snapshot utility is extremely useful, and does not add any more
overhead in memory when another latency tracer is enabled. They use
the snapshot underneath. There's no reason to hide the snapshot file
when a latency tracer has been enabled in the kernel.

If any of the latency tracers (irq, preempt or wakeup) is enabled
then also select the snapshot facility.

Note, snapshot can be enabled without the latency tracers enabled.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b516a8e19d51..590a27fc212f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -191,6 +191,7 @@ config IRQSOFF_TRACER
 	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	select RING_BUFFER_ALLOW_SWAP
+	select TRACER_SNAPSHOT
 	help
 	  This option measures the time spent in irqs-off critical
 	  sections, with microsecond accuracy.
@@ -213,6 +214,7 @@ config PREEMPT_TRACER
 	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	select RING_BUFFER_ALLOW_SWAP
+	select TRACER_SNAPSHOT
 	help
 	  This option measures the time spent in preemption-off critical
 	  sections, with microsecond accuracy.
@@ -232,6 +234,7 @@ config SCHED_TRACER
 	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
+	select TRACER_SNAPSHOT
 	help
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
-- 
cgit v1.2.3


From 12883efb670c28dff57dcd7f4f995a1ffe153b2d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 09:24:35 -0500
Subject: tracing: Consolidate max_tr into main trace_array structure

Currently, the way the latency tracers and snapshot feature works
is to have a separate trace_array called "max_tr" that holds the
snapshot buffer. For latency tracers, this snapshot buffer is used
to swap the running buffer with this buffer to save the current max
latency.

The only items needed for the max_tr is really just a copy of the buffer
itself, the per_cpu data pointers, the time_start timestamp that states
when the max latency was triggered, and the cpu that the max latency
was triggered on. All other fields in trace_array are unused by the
max_tr, making the max_tr mostly bloat.

This change removes the max_tr completely, and adds a new structure
called trace_buffer, that holds the buffer pointer, the per_cpu data
pointers, the time_start timestamp, and the cpu where the latency occurred.

The trace_array, now has two trace_buffers, one for the normal trace and
one for the max trace or snapshot. By doing this, not only do we remove
the bloat from the max_trace but the instances of traces can now use
their own snapshot feature and not have just the top level global_trace have
the snapshot feature and latency tracers for itself.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/blktrace.c              |   4 +-
 kernel/trace/trace.c                 | 486 +++++++++++++++++++----------------
 kernel/trace/trace.h                 |  37 ++-
 kernel/trace/trace_functions.c       |   8 +-
 kernel/trace/trace_functions_graph.c |  12 +-
 kernel/trace/trace_irqsoff.c         |  10 +-
 kernel/trace/trace_kdb.c             |   8 +-
 kernel/trace/trace_mmiotrace.c       |  12 +-
 kernel/trace/trace_output.c          |   2 +-
 kernel/trace/trace_sched_switch.c    |   8 +-
 kernel/trace/trace_sched_wakeup.c    |  16 +-
 kernel/trace/trace_selftest.c        |  42 +--
 kernel/trace/trace_syscalls.c        |   4 +-
 13 files changed, 363 insertions(+), 286 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 71259e2b6b61..90a55054744c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 	bool blk_tracer = blk_tracer_enabled;
 
 	if (blk_tracer) {
-		buffer = blk_tr->buffer;
+		buffer = blk_tr->trace_buffer.buffer;
 		pc = preempt_count();
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
 						  sizeof(*t) + len,
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	if (blk_tracer) {
 		tracing_record_cmdline(current);
 
-		buffer = blk_tr->buffer;
+		buffer = blk_tr->trace_buffer.buffer;
 		pc = preempt_count();
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
 						  sizeof(*t) + pdu_len,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c8a852a55db4..a08c127db865 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -195,27 +195,15 @@ cycle_t ftrace_now(int cpu)
 	u64 ts;
 
 	/* Early boot up does not have a buffer yet */
-	if (!global_trace.buffer)
+	if (!global_trace.trace_buffer.buffer)
 		return trace_clock_local();
 
-	ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
-	ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
+	ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
+	ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
 
 	return ts;
 }
 
-/*
- * The max_tr is used to snapshot the global_trace when a maximum
- * latency is reached. Some tracers will use this to store a maximum
- * trace while it continues examining live traces.
- *
- * The buffers for the max_tr are set up the same as the global_trace.
- * When a snapshot is taken, the link list of the max_tr is swapped
- * with the link list of the global_trace and the buffers are reset for
- * the global_trace so the tracing can continue.
- */
-static struct trace_array	max_tr;
-
 int tracing_is_enabled(void)
 {
 	return tracing_is_on();
@@ -339,8 +327,8 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
  */
 void tracing_on(void)
 {
-	if (global_trace.buffer)
-		ring_buffer_record_on(global_trace.buffer);
+	if (global_trace.trace_buffer.buffer)
+		ring_buffer_record_on(global_trace.trace_buffer.buffer);
 	/*
 	 * This flag is only looked at when buffers haven't been
 	 * allocated yet. We don't really care about the race
@@ -361,8 +349,8 @@ EXPORT_SYMBOL_GPL(tracing_on);
  */
 void tracing_off(void)
 {
-	if (global_trace.buffer)
-		ring_buffer_record_off(global_trace.buffer);
+	if (global_trace.trace_buffer.buffer)
+		ring_buffer_record_off(global_trace.trace_buffer.buffer);
 	/*
 	 * This flag is only looked at when buffers haven't been
 	 * allocated yet. We don't really care about the race
@@ -378,8 +366,8 @@ EXPORT_SYMBOL_GPL(tracing_off);
  */
 int tracing_is_on(void)
 {
-	if (global_trace.buffer)
-		return ring_buffer_record_is_on(global_trace.buffer);
+	if (global_trace.trace_buffer.buffer)
+		return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
 	return !global_trace.buffer_disabled;
 }
 EXPORT_SYMBOL_GPL(tracing_is_on);
@@ -637,13 +625,14 @@ unsigned long __read_mostly	tracing_max_latency;
 static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
-	struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu);
-	struct trace_array_cpu *max_data;
+	struct trace_buffer *trace_buf = &tr->trace_buffer;
+	struct trace_buffer *max_buf = &tr->max_buffer;
+	struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
+	struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
 
-	max_tr.cpu = cpu;
-	max_tr.time_start = data->preempt_timestamp;
+	max_buf->cpu = cpu;
+	max_buf->time_start = data->preempt_timestamp;
 
-	max_data = per_cpu_ptr(max_tr.data, cpu);
 	max_data->saved_latency = tracing_max_latency;
 	max_data->critical_start = data->critical_start;
 	max_data->critical_end = data->critical_end;
@@ -686,9 +675,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	arch_spin_lock(&ftrace_max_lock);
 
-	buf = tr->buffer;
-	tr->buffer = max_tr.buffer;
-	max_tr.buffer = buf;
+	buf = tr->trace_buffer.buffer;
+	tr->trace_buffer.buffer = tr->max_buffer.buffer;
+	tr->max_buffer.buffer = buf;
 
 	__update_max_tr(tr, tsk, cpu);
 	arch_spin_unlock(&ftrace_max_lock);
@@ -716,7 +705,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	arch_spin_lock(&ftrace_max_lock);
 
-	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
+	ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
 
 	if (ret == -EBUSY) {
 		/*
@@ -725,7 +714,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		 * the max trace buffer (no one writes directly to it)
 		 * and flag that it failed.
 		 */
-		trace_array_printk(&max_tr, _THIS_IP_,
+		trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
 			"Failed to swap buffers due to commit in progress\n");
 	}
 
@@ -742,7 +731,7 @@ static void default_wait_pipe(struct trace_iterator *iter)
 	if (trace_buffer_iter(iter, iter->cpu_file))
 		return;
 
-	ring_buffer_wait(iter->tr->buffer, iter->cpu_file);
+	ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
 }
 
 /**
@@ -803,17 +792,19 @@ int register_tracer(struct tracer *type)
 		 * internal tracing to verify that everything is in order.
 		 * If we fail, we do not register this tracer.
 		 */
-		tracing_reset_online_cpus(tr);
+		tracing_reset_online_cpus(&tr->trace_buffer);
 
 		tr->current_trace = type;
 
+#ifdef CONFIG_TRACER_MAX_TRACE
 		if (type->use_max_tr) {
 			/* If we expanded the buffers, make sure the max is expanded too */
 			if (ring_buffer_expanded)
-				ring_buffer_resize(max_tr.buffer, trace_buf_size,
+				ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
 						   RING_BUFFER_ALL_CPUS);
 			type->allocated_snapshot = true;
 		}
+#endif
 
 		/* the test is responsible for initializing and enabling */
 		pr_info("Testing tracer %s: ", type->name);
@@ -827,16 +818,18 @@ int register_tracer(struct tracer *type)
 			goto out;
 		}
 		/* Only reset on passing, to avoid touching corrupted buffers */
-		tracing_reset_online_cpus(tr);
+		tracing_reset_online_cpus(&tr->trace_buffer);
 
+#ifdef CONFIG_TRACER_MAX_TRACE
 		if (type->use_max_tr) {
 			type->allocated_snapshot = false;
 
 			/* Shrink the max buffer again */
 			if (ring_buffer_expanded)
-				ring_buffer_resize(max_tr.buffer, 1,
+				ring_buffer_resize(tr->max_buffer.buffer, 1,
 						   RING_BUFFER_ALL_CPUS);
 		}
+#endif
 
 		printk(KERN_CONT "PASSED\n");
 	}
@@ -870,9 +863,9 @@ int register_tracer(struct tracer *type)
 	return ret;
 }
 
-void tracing_reset(struct trace_array *tr, int cpu)
+void tracing_reset(struct trace_buffer *buf, int cpu)
 {
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = buf->buffer;
 
 	if (!buffer)
 		return;
@@ -886,9 +879,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
 	ring_buffer_record_enable(buffer);
 }
 
-void tracing_reset_online_cpus(struct trace_array *tr)
+void tracing_reset_online_cpus(struct trace_buffer *buf)
 {
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = buf->buffer;
 	int cpu;
 
 	if (!buffer)
@@ -899,7 +892,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
 	/* Make sure all commits have finished */
 	synchronize_sched();
 
-	tr->time_start = ftrace_now(tr->cpu);
+	buf->time_start = ftrace_now(buf->cpu);
 
 	for_each_online_cpu(cpu)
 		ring_buffer_reset_cpu(buffer, cpu);
@@ -909,7 +902,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
 
 void tracing_reset_current(int cpu)
 {
-	tracing_reset(&global_trace, cpu);
+	tracing_reset(&global_trace.trace_buffer, cpu);
 }
 
 void tracing_reset_all_online_cpus(void)
@@ -918,7 +911,10 @@ void tracing_reset_all_online_cpus(void)
 
 	mutex_lock(&trace_types_lock);
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
-		tracing_reset_online_cpus(tr);
+		tracing_reset_online_cpus(&tr->trace_buffer);
+#ifdef CONFIG_TRACER_MAX_TRACE
+		tracing_reset_online_cpus(&tr->max_buffer);
+#endif
 	}
 	mutex_unlock(&trace_types_lock);
 }
@@ -988,13 +984,15 @@ void tracing_start(void)
 	/* Prevent the buffers from switching */
 	arch_spin_lock(&ftrace_max_lock);
 
-	buffer = global_trace.buffer;
+	buffer = global_trace.trace_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_enable(buffer);
 
-	buffer = max_tr.buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+	buffer = global_trace.max_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_enable(buffer);
+#endif
 
 	arch_spin_unlock(&ftrace_max_lock);
 
@@ -1026,7 +1024,7 @@ static void tracing_start_tr(struct trace_array *tr)
 		goto out;
 	}
 
-	buffer = tr->buffer;
+	buffer = tr->trace_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_enable(buffer);
 
@@ -1053,13 +1051,15 @@ void tracing_stop(void)
 	/* Prevent the buffers from switching */
 	arch_spin_lock(&ftrace_max_lock);
 
-	buffer = global_trace.buffer;
+	buffer = global_trace.trace_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_disable(buffer);
 
-	buffer = max_tr.buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+	buffer = global_trace.max_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_disable(buffer);
+#endif
 
 	arch_spin_unlock(&ftrace_max_lock);
 
@@ -1080,7 +1080,7 @@ static void tracing_stop_tr(struct trace_array *tr)
 	if (tr->stop_count++)
 		goto out;
 
-	buffer = tr->buffer;
+	buffer = tr->trace_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_disable(buffer);
 
@@ -1246,7 +1246,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 			  int type, unsigned long len,
 			  unsigned long flags, int pc)
 {
-	*current_rb = ftrace_file->tr->buffer;
+	*current_rb = ftrace_file->tr->trace_buffer.buffer;
 	return trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 }
@@ -1257,7 +1257,7 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
 				  int type, unsigned long len,
 				  unsigned long flags, int pc)
 {
-	*current_rb = global_trace.buffer;
+	*current_rb = global_trace.trace_buffer.buffer;
 	return trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 }
@@ -1296,7 +1296,7 @@ trace_function(struct trace_array *tr,
 	       int pc)
 {
 	struct ftrace_event_call *call = &event_function;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
 
@@ -1437,7 +1437,7 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
 void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
 		   int pc)
 {
-	__ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
+	__ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
 }
 
 /**
@@ -1453,7 +1453,8 @@ void trace_dump_stack(void)
 	local_save_flags(flags);
 
 	/* skipping 3 traces, seems to get us at the caller of this function */
-	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
+	__ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, 3,
+			     preempt_count(), NULL);
 }
 
 static DEFINE_PER_CPU(int, user_stack_count);
@@ -1623,7 +1624,7 @@ void trace_printk_init_buffers(void)
 	 * directly here. If the global_trace.buffer is already
 	 * allocated here, then this was called by module code.
 	 */
-	if (global_trace.buffer)
+	if (global_trace.trace_buffer.buffer)
 		tracing_start_cmdline_record();
 }
 
@@ -1683,7 +1684,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 
 	local_save_flags(flags);
 	size = sizeof(*entry) + sizeof(u32) * len;
-	buffer = tr->buffer;
+	buffer = tr->trace_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
 					  flags, pc);
 	if (!event)
@@ -1706,27 +1707,12 @@ out:
 }
 EXPORT_SYMBOL_GPL(trace_vbprintk);
 
-int trace_array_printk(struct trace_array *tr,
-		       unsigned long ip, const char *fmt, ...)
-{
-	int ret;
-	va_list ap;
-
-	if (!(trace_flags & TRACE_ITER_PRINTK))
-		return 0;
-
-	va_start(ap, fmt);
-	ret = trace_array_vprintk(tr, ip, fmt, ap);
-	va_end(ap);
-	return ret;
-}
-
-int trace_array_vprintk(struct trace_array *tr,
-			unsigned long ip, const char *fmt, va_list args)
+static int
+__trace_array_vprintk(struct ring_buffer *buffer,
+		      unsigned long ip, const char *fmt, va_list args)
 {
 	struct ftrace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
-	struct ring_buffer *buffer;
 	int len = 0, size, pc;
 	struct print_entry *entry;
 	unsigned long flags;
@@ -1754,7 +1740,6 @@ int trace_array_vprintk(struct trace_array *tr,
 
 	local_save_flags(flags);
 	size = sizeof(*entry) + len + 1;
-	buffer = tr->buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					  flags, pc);
 	if (!event)
@@ -1775,6 +1760,42 @@ int trace_array_vprintk(struct trace_array *tr,
 	return len;
 }
 
+int trace_array_vprintk(struct trace_array *tr,
+			unsigned long ip, const char *fmt, va_list args)
+{
+	return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
+}
+
+int trace_array_printk(struct trace_array *tr,
+		       unsigned long ip, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	va_start(ap, fmt);
+	ret = trace_array_vprintk(tr, ip, fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
+int trace_array_printk_buf(struct ring_buffer *buffer,
+			   unsigned long ip, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	va_start(ap, fmt);
+	ret = __trace_array_vprintk(buffer, ip, fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
 int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 {
 	return trace_array_vprintk(&global_trace, ip, fmt, args);
@@ -1800,7 +1821,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
 	if (buf_iter)
 		event = ring_buffer_iter_peek(buf_iter, ts);
 	else
-		event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
+		event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
 					 lost_events);
 
 	if (event) {
@@ -1815,7 +1836,7 @@ static struct trace_entry *
 __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
 		  unsigned long *missing_events, u64 *ent_ts)
 {
-	struct ring_buffer *buffer = iter->tr->buffer;
+	struct ring_buffer *buffer = iter->trace_buffer->buffer;
 	struct trace_entry *ent, *next = NULL;
 	unsigned long lost_events = 0, next_lost = 0;
 	int cpu_file = iter->cpu_file;
@@ -1892,7 +1913,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
 
 static void trace_consume(struct trace_iterator *iter)
 {
-	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
+	ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
 			    &iter->lost_events);
 }
 
@@ -1925,13 +1946,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 
 void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 {
-	struct trace_array *tr = iter->tr;
 	struct ring_buffer_event *event;
 	struct ring_buffer_iter *buf_iter;
 	unsigned long entries = 0;
 	u64 ts;
 
-	per_cpu_ptr(tr->data, cpu)->skipped_entries = 0;
+	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
 
 	buf_iter = trace_buffer_iter(iter, cpu);
 	if (!buf_iter)
@@ -1945,13 +1965,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 	 * by the timestamp being before the start of the buffer.
 	 */
 	while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
-		if (ts >= iter->tr->time_start)
+		if (ts >= iter->trace_buffer->time_start)
 			break;
 		entries++;
 		ring_buffer_read(buf_iter, NULL);
 	}
 
-	per_cpu_ptr(tr->data, cpu)->skipped_entries = entries;
+	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
 }
 
 /*
@@ -1978,8 +1998,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		*iter->trace = *tr->current_trace;
 	mutex_unlock(&trace_types_lock);
 
+#ifdef CONFIG_TRACER_MAX_TRACE
 	if (iter->snapshot && iter->trace->use_max_tr)
 		return ERR_PTR(-EBUSY);
+#endif
 
 	if (!iter->snapshot)
 		atomic_inc(&trace_record_cmdline_disabled);
@@ -2021,17 +2043,21 @@ static void s_stop(struct seq_file *m, void *p)
 {
 	struct trace_iterator *iter = m->private;
 
+#ifdef CONFIG_TRACER_MAX_TRACE
 	if (iter->snapshot && iter->trace->use_max_tr)
 		return;
+#endif
 
 	if (!iter->snapshot)
 		atomic_dec(&trace_record_cmdline_disabled);
+
 	trace_access_unlock(iter->cpu_file);
 	trace_event_read_unlock();
 }
 
 static void
-get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
+get_total_entries(struct trace_buffer *buf,
+		  unsigned long *total, unsigned long *entries)
 {
 	unsigned long count;
 	int cpu;
@@ -2040,19 +2066,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
 	*entries = 0;
 
 	for_each_tracing_cpu(cpu) {
-		count = ring_buffer_entries_cpu(tr->buffer, cpu);
+		count = ring_buffer_entries_cpu(buf->buffer, cpu);
 		/*
 		 * If this buffer has skipped entries, then we hold all
 		 * entries for the trace and we need to ignore the
 		 * ones before the time stamp.
 		 */
-		if (per_cpu_ptr(tr->data, cpu)->skipped_entries) {
-			count -= per_cpu_ptr(tr->data, cpu)->skipped_entries;
+		if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
+			count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
 			/* total is the same as the entries */
 			*total += count;
 		} else
 			*total += count +
-				ring_buffer_overrun_cpu(tr->buffer, cpu);
+				ring_buffer_overrun_cpu(buf->buffer, cpu);
 		*entries += count;
 	}
 }
@@ -2069,27 +2095,27 @@ static void print_lat_help_header(struct seq_file *m)
 	seq_puts(m, "#     \\   /      |||||  \\    |   /           \n");
 }
 
-static void print_event_info(struct trace_array *tr, struct seq_file *m)
+static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
 {
 	unsigned long total;
 	unsigned long entries;
 
-	get_total_entries(tr, &total, &entries);
+	get_total_entries(buf, &total, &entries);
 	seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n",
 		   entries, total, num_online_cpus());
 	seq_puts(m, "#\n");
 }
 
-static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
+static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
 {
-	print_event_info(tr, m);
+	print_event_info(buf, m);
 	seq_puts(m, "#           TASK-PID   CPU#      TIMESTAMP  FUNCTION\n");
 	seq_puts(m, "#              | |       |          |         |\n");
 }
 
-static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
+static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
 {
-	print_event_info(tr, m);
+	print_event_info(buf, m);
 	seq_puts(m, "#                              _-----=> irqs-off\n");
 	seq_puts(m, "#                             / _----=> need-resched\n");
 	seq_puts(m, "#                            | / _---=> hardirq/softirq\n");
@@ -2103,8 +2129,8 @@ void
 print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 {
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
-	struct trace_array *tr = iter->tr;
-	struct trace_array_cpu *data = per_cpu_ptr(tr->data, tr->cpu);
+	struct trace_buffer *buf = iter->trace_buffer;
+	struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
 	struct tracer *type = iter->trace;
 	unsigned long entries;
 	unsigned long total;
@@ -2112,7 +2138,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 
 	name = type->name;
 
-	get_total_entries(tr, &total, &entries);
+	get_total_entries(buf, &total, &entries);
 
 	seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
 		   name, UTS_RELEASE);
@@ -2123,7 +2149,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 		   nsecs_to_usecs(data->saved_latency),
 		   entries,
 		   total,
-		   tr->cpu,
+		   buf->cpu,
 #if defined(CONFIG_PREEMPT_NONE)
 		   "server",
 #elif defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -2174,7 +2200,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 	if (cpumask_test_cpu(iter->cpu, iter->started))
 		return;
 
-	if (per_cpu_ptr(iter->tr->data, iter->cpu)->skipped_entries)
+	if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
 		return;
 
 	cpumask_set_cpu(iter->cpu, iter->started);
@@ -2304,7 +2330,7 @@ int trace_empty(struct trace_iterator *iter)
 			if (!ring_buffer_iter_empty(buf_iter))
 				return 0;
 		} else {
-			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
 				return 0;
 		}
 		return 1;
@@ -2316,7 +2342,7 @@ int trace_empty(struct trace_iterator *iter)
 			if (!ring_buffer_iter_empty(buf_iter))
 				return 0;
 		} else {
-			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
 				return 0;
 		}
 	}
@@ -2394,9 +2420,9 @@ void trace_default_header(struct seq_file *m)
 	} else {
 		if (!(trace_flags & TRACE_ITER_VERBOSE)) {
 			if (trace_flags & TRACE_ITER_IRQ_INFO)
-				print_func_help_header_irq(iter->tr, m);
+				print_func_help_header_irq(iter->trace_buffer, m);
 			else
-				print_func_help_header(iter->tr, m);
+				print_func_help_header(iter->trace_buffer, m);
 		}
 	}
 }
@@ -2515,11 +2541,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
 		goto fail;
 
+	iter->tr = tr;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
 	/* Currently only the top directory has a snapshot */
 	if (tr->current_trace->print_max || snapshot)
-		iter->tr = &max_tr;
+		iter->trace_buffer = &tr->max_buffer;
 	else
-		iter->tr = tr;
+#endif
+		iter->trace_buffer = &tr->trace_buffer;
 	iter->snapshot = snapshot;
 	iter->pos = -1;
 	mutex_init(&iter->mutex);
@@ -2530,7 +2560,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 		iter->trace->open(iter);
 
 	/* Annotate start of buffers if we had overruns */
-	if (ring_buffer_overruns(iter->tr->buffer))
+	if (ring_buffer_overruns(iter->trace_buffer->buffer))
 		iter->iter_flags |= TRACE_FILE_ANNOTATE;
 
 	/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -2544,7 +2574,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->tr->buffer, cpu);
+				ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
 		}
 		ring_buffer_read_prepare_sync();
 		for_each_tracing_cpu(cpu) {
@@ -2554,7 +2584,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->tr->buffer, cpu);
+			ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
 		ring_buffer_read_prepare_sync();
 		ring_buffer_read_start(iter->buffer_iter[cpu]);
 		tracing_iter_reset(iter, cpu);
@@ -2593,12 +2623,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 		return 0;
 
 	iter = m->private;
-
-	/* Only the global tracer has a matching max_tr */
-	if (iter->tr == &max_tr)
-		tr = &global_trace;
-	else
-		tr = iter->tr;
+	tr = iter->tr;
 
 	mutex_lock(&trace_types_lock);
 	for_each_tracing_cpu(cpu) {
@@ -2634,9 +2659,9 @@ static int tracing_open(struct inode *inode, struct file *file)
 		struct trace_array *tr = tc->tr;
 
 		if (tc->cpu == RING_BUFFER_ALL_CPUS)
-			tracing_reset_online_cpus(tr);
+			tracing_reset_online_cpus(&tr->trace_buffer);
 		else
-			tracing_reset(tr, tc->cpu);
+			tracing_reset(&tr->trace_buffer, tc->cpu);
 	}
 
 	if (file->f_mode & FMODE_READ) {
@@ -2805,13 +2830,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 		 */
 		if (cpumask_test_cpu(cpu, tracing_cpumask) &&
 				!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
-			atomic_inc(&per_cpu_ptr(tr->data, cpu)->disabled);
-			ring_buffer_record_disable_cpu(tr->buffer, cpu);
+			atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
+			ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
 		}
 		if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
 				cpumask_test_cpu(cpu, tracing_cpumask_new)) {
-			atomic_dec(&per_cpu_ptr(tr->data, cpu)->disabled);
-			ring_buffer_record_enable_cpu(tr->buffer, cpu);
+			atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
+			ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
 		}
 	}
 	arch_spin_unlock(&ftrace_max_lock);
@@ -2930,9 +2955,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 		trace_event_enable_cmd_record(enabled);
 
 	if (mask == TRACE_ITER_OVERWRITE) {
-		ring_buffer_change_overwrite(global_trace.buffer, enabled);
+		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
-		ring_buffer_change_overwrite(max_tr.buffer, enabled);
+		ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
 #endif
 	}
 
@@ -3116,42 +3141,44 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
 
 int tracer_init(struct tracer *t, struct trace_array *tr)
 {
-	tracing_reset_online_cpus(tr);
+	tracing_reset_online_cpus(&tr->trace_buffer);
 	return t->init(tr);
 }
 
-static void set_buffer_entries(struct trace_array *tr, unsigned long val)
+static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
 {
 	int cpu;
 	for_each_tracing_cpu(cpu)
-		per_cpu_ptr(tr->data, cpu)->entries = val;
+		per_cpu_ptr(buf->data, cpu)->entries = val;
 }
 
+#ifdef CONFIG_TRACER_MAX_TRACE
 /* resize @tr's buffer to the size of @size_tr's entries */
-static int resize_buffer_duplicate_size(struct trace_array *tr,
-					struct trace_array *size_tr, int cpu_id)
+static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
+					struct trace_buffer *size_buf, int cpu_id)
 {
 	int cpu, ret = 0;
 
 	if (cpu_id == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
-			ret = ring_buffer_resize(tr->buffer,
-				 per_cpu_ptr(size_tr->data, cpu)->entries, cpu);
+			ret = ring_buffer_resize(trace_buf->buffer,
+				 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
 			if (ret < 0)
 				break;
-			per_cpu_ptr(tr->data, cpu)->entries =
-				per_cpu_ptr(size_tr->data, cpu)->entries;
+			per_cpu_ptr(trace_buf->data, cpu)->entries =
+				per_cpu_ptr(size_buf->data, cpu)->entries;
 		}
 	} else {
-		ret = ring_buffer_resize(tr->buffer,
-				 per_cpu_ptr(size_tr->data, cpu_id)->entries, cpu_id);
+		ret = ring_buffer_resize(trace_buf->buffer,
+				 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
 		if (ret == 0)
-			per_cpu_ptr(tr->data, cpu_id)->entries =
-				per_cpu_ptr(size_tr->data, cpu_id)->entries;
+			per_cpu_ptr(trace_buf->data, cpu_id)->entries =
+				per_cpu_ptr(size_buf->data, cpu_id)->entries;
 	}
 
 	return ret;
 }
+#endif /* CONFIG_TRACER_MAX_TRACE */
 
 static int __tracing_resize_ring_buffer(struct trace_array *tr,
 					unsigned long size, int cpu)
@@ -3166,20 +3193,22 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 	ring_buffer_expanded = 1;
 
 	/* May be called before buffers are initialized */
-	if (!tr->buffer)
+	if (!tr->trace_buffer.buffer)
 		return 0;
 
-	ret = ring_buffer_resize(tr->buffer, size, cpu);
+	ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
 	if (ret < 0)
 		return ret;
 
+#ifdef CONFIG_TRACER_MAX_TRACE
 	if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
 	    !tr->current_trace->use_max_tr)
 		goto out;
 
-	ret = ring_buffer_resize(max_tr.buffer, size, cpu);
+	ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
 	if (ret < 0) {
-		int r = resize_buffer_duplicate_size(tr, tr, cpu);
+		int r = resize_buffer_duplicate_size(&tr->trace_buffer,
+						     &tr->trace_buffer, cpu);
 		if (r < 0) {
 			/*
 			 * AARGH! We are left with different
@@ -3202,15 +3231,17 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 	}
 
 	if (cpu == RING_BUFFER_ALL_CPUS)
-		set_buffer_entries(&max_tr, size);
+		set_buffer_entries(&tr->max_buffer, size);
 	else
-		per_cpu_ptr(max_tr.data, cpu)->entries = size;
+		per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
 
  out:
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
 	if (cpu == RING_BUFFER_ALL_CPUS)
-		set_buffer_entries(tr, size);
+		set_buffer_entries(&tr->trace_buffer, size);
 	else
-		per_cpu_ptr(tr->data, cpu)->entries = size;
+		per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
 
 	return ret;
 }
@@ -3277,7 +3308,9 @@ static int tracing_set_tracer(const char *buf)
 	static struct trace_option_dentry *topts;
 	struct trace_array *tr = &global_trace;
 	struct tracer *t;
+#ifdef CONFIG_TRACER_MAX_TRACE
 	bool had_max_tr;
+#endif
 	int ret = 0;
 
 	mutex_lock(&trace_types_lock);
@@ -3308,7 +3341,10 @@ static int tracing_set_tracer(const char *buf)
 	if (tr->current_trace->reset)
 		tr->current_trace->reset(tr);
 
+#ifdef CONFIG_TRACER_MAX_TRACE
 	had_max_tr = tr->current_trace->allocated_snapshot;
+
+	/* Current trace needs to be nop_trace before synchronize_sched */
 	tr->current_trace = &nop_trace;
 
 	if (had_max_tr && !t->use_max_tr) {
@@ -3325,22 +3361,28 @@ static int tracing_set_tracer(const char *buf)
 		 * The max_tr ring buffer has some state (e.g. ring->clock) and
 		 * we want preserve it.
 		 */
-		ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
-		set_buffer_entries(&max_tr, 1);
-		tracing_reset_online_cpus(&max_tr);
+		ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+		set_buffer_entries(&tr->max_buffer, 1);
+		tracing_reset_online_cpus(&tr->max_buffer);
 		tr->current_trace->allocated_snapshot = false;
 	}
+#else
+	tr->current_trace = &nop_trace;
+#endif
 	destroy_trace_option_files(topts);
 
 	topts = create_trace_option_files(tr, t);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
 	if (t->use_max_tr && !had_max_tr) {
 		/* we need to make per cpu buffer sizes equivalent */
-		ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
+		ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->trace_buffer,
 						   RING_BUFFER_ALL_CPUS);
 		if (ret < 0)
 			goto out;
 		t->allocated_snapshot = true;
 	}
+#endif
 
 	if (t->init) {
 		ret = tracer_init(t, tr);
@@ -3468,6 +3510,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 
 	iter->cpu_file = tc->cpu;
 	iter->tr = tc->tr;
+	iter->trace_buffer = &tc->tr->trace_buffer;
 	mutex_init(&iter->mutex);
 	filp->private_data = iter;
 
@@ -3518,7 +3561,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
 		 */
 		return POLLIN | POLLRDNORM;
 	else
-		return ring_buffer_poll_wait(iter->tr->buffer, iter->cpu_file,
+		return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
 					     filp, poll_table);
 }
 
@@ -3857,8 +3900,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		for_each_tracing_cpu(cpu) {
 			/* fill in the size from first enabled cpu */
 			if (size == 0)
-				size = per_cpu_ptr(tr->data, cpu)->entries;
-			if (size != per_cpu_ptr(tr->data, cpu)->entries) {
+				size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
+			if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
 				buf_size_same = 0;
 				break;
 			}
@@ -3874,7 +3917,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		} else
 			r = sprintf(buf, "X\n");
 	} else
-		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->data, tc->cpu)->entries >> 10);
+		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);
 
 	mutex_unlock(&trace_types_lock);
 
@@ -3921,7 +3964,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
 
 	mutex_lock(&trace_types_lock);
 	for_each_tracing_cpu(cpu) {
-		size += per_cpu_ptr(tr->data, cpu)->entries >> 10;
+		size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
 		if (!ring_buffer_expanded)
 			expanded_size += trace_buf_size >> 10;
 	}
@@ -4026,7 +4069,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 
 	local_save_flags(irq_flags);
 	size = sizeof(*entry) + cnt + 2; /* possible \n added */
-	buffer = global_trace.buffer;
+	buffer = global_trace.trace_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					  irq_flags, preempt_count());
 	if (!event) {
@@ -4111,16 +4154,19 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
 
 	tr->clock_id = i;
 
-	ring_buffer_set_clock(tr->buffer, trace_clocks[i].func);
-	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && max_tr.buffer)
-		ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
+	ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
 
 	/*
 	 * New clock may not be consistent with the previous clock.
 	 * Reset the buffer so that it doesn't have incomparable timestamps.
 	 */
-	tracing_reset_online_cpus(&global_trace);
-	tracing_reset_online_cpus(&max_tr);
+	tracing_reset_online_cpus(&global_trace.trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+		ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
+	tracing_reset_online_cpus(&global_trace.max_buffer);
+#endif
 
 	mutex_unlock(&trace_types_lock);
 
@@ -4160,6 +4206,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
 			return -ENOMEM;
 		}
 		iter->tr = tc->tr;
+		iter->trace_buffer = &tc->tr->max_buffer;
 		m->private = iter;
 		file->private_data = m;
 	}
@@ -4196,18 +4243,18 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	case 0:
 		if (tr->current_trace->allocated_snapshot) {
 			/* free spare buffer */
-			ring_buffer_resize(max_tr.buffer, 1,
+			ring_buffer_resize(tr->max_buffer.buffer, 1,
 					   RING_BUFFER_ALL_CPUS);
-			set_buffer_entries(&max_tr, 1);
-			tracing_reset_online_cpus(&max_tr);
+			set_buffer_entries(&tr->max_buffer, 1);
+			tracing_reset_online_cpus(&tr->max_buffer);
 			tr->current_trace->allocated_snapshot = false;
 		}
 		break;
 	case 1:
 		if (!tr->current_trace->allocated_snapshot) {
 			/* allocate spare buffer */
-			ret = resize_buffer_duplicate_size(&max_tr,
-					&global_trace, RING_BUFFER_ALL_CPUS);
+			ret = resize_buffer_duplicate_size(&tr->max_buffer,
+					&tr->trace_buffer, RING_BUFFER_ALL_CPUS);
 			if (ret < 0)
 				break;
 			tr->current_trace->allocated_snapshot = true;
@@ -4220,7 +4267,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		break;
 	default:
 		if (tr->current_trace->allocated_snapshot)
-			tracing_reset_online_cpus(&max_tr);
+			tracing_reset_online_cpus(&tr->max_buffer);
 		break;
 	}
 
@@ -4338,6 +4385,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 	info->iter.tr		= tr;
 	info->iter.cpu_file	= tc->cpu;
 	info->iter.trace	= tr->current_trace;
+	info->iter.trace_buffer = &tr->trace_buffer;
 	info->spare		= NULL;
 	/* Force reading ring buffer for first read */
 	info->read		= (unsigned int)-1;
@@ -4369,7 +4417,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return 0;
 
 	if (!info->spare)
-		info->spare = ring_buffer_alloc_read_page(iter->tr->buffer, iter->cpu_file);
+		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
+							  iter->cpu_file);
 	if (!info->spare)
 		return -ENOMEM;
 
@@ -4379,7 +4428,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
  again:
 	trace_access_lock(iter->cpu_file);
-	ret = ring_buffer_read_page(iter->tr->buffer,
+	ret = ring_buffer_read_page(iter->trace_buffer->buffer,
 				    &info->spare,
 				    count,
 				    iter->cpu_file, 0);
@@ -4421,7 +4470,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 	struct trace_iterator *iter = &info->iter;
 
 	if (info->spare)
-		ring_buffer_free_read_page(iter->tr->buffer, info->spare);
+		ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
 	kfree(info);
 
 	return 0;
@@ -4521,7 +4570,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 
  again:
 	trace_access_lock(iter->cpu_file);
-	entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file);
+	entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
 
 	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
@@ -4532,7 +4581,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			break;
 
 		ref->ref = 1;
-		ref->buffer = iter->tr->buffer;
+		ref->buffer = iter->trace_buffer->buffer;
 		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
 		if (!ref->page) {
 			kfree(ref);
@@ -4564,7 +4613,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.nr_pages++;
 		*ppos += PAGE_SIZE;
 
-		entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file);
+		entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
 	}
 
 	trace_access_unlock(iter->cpu_file);
@@ -4605,6 +4654,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 {
 	struct trace_cpu *tc = filp->private_data;
 	struct trace_array *tr = tc->tr;
+	struct trace_buffer *trace_buf = &tr->trace_buffer;
 	struct trace_seq *s;
 	unsigned long cnt;
 	unsigned long long t;
@@ -4617,41 +4667,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 
 	trace_seq_init(s);
 
-	cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+	cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
 	trace_seq_printf(s, "entries: %ld\n", cnt);
 
-	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+	cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
 	trace_seq_printf(s, "overrun: %ld\n", cnt);
 
-	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+	cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
 	trace_seq_printf(s, "commit overrun: %ld\n", cnt);
 
-	cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
+	cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
 	trace_seq_printf(s, "bytes: %ld\n", cnt);
 
 	if (trace_clocks[trace_clock_id].in_ns) {
 		/* local or global for trace_clock */
-		t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
+		t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
 		usec_rem = do_div(t, USEC_PER_SEC);
 		trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
 								t, usec_rem);
 
-		t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
+		t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
 		usec_rem = do_div(t, USEC_PER_SEC);
 		trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
 	} else {
 		/* counter or tsc mode for trace_clock */
 		trace_seq_printf(s, "oldest event ts: %llu\n",
-				ring_buffer_oldest_event_ts(tr->buffer, cpu));
+				ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
 
 		trace_seq_printf(s, "now ts: %llu\n",
-				ring_buffer_time_stamp(tr->buffer, cpu));
+				ring_buffer_time_stamp(trace_buf->buffer, cpu));
 	}
 
-	cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
+	cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
 	trace_seq_printf(s, "dropped events: %ld\n", cnt);
 
-	cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
+	cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
 	trace_seq_printf(s, "read events: %ld\n", cnt);
 
 	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
@@ -4754,7 +4804,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
 static void
 tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
 {
-	struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu);
+	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 	struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
 	struct dentry *d_cpu;
 	char cpu_dir[30]; /* 30 characters should be more than enough */
@@ -5038,7 +5088,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,
 	       size_t cnt, loff_t *ppos)
 {
 	struct trace_array *tr = filp->private_data;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	char buf[64];
 	int r;
 
@@ -5057,7 +5107,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
 	struct trace_array *tr = filp->private_data;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	unsigned long val;
 	int ret;
 
@@ -5129,18 +5179,18 @@ static int new_instance_create(const char *name)
 
 	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
 
-	tr->buffer = ring_buffer_alloc(trace_buf_size, rb_flags);
-	if (!tr->buffer)
+	tr->trace_buffer.buffer = ring_buffer_alloc(trace_buf_size, rb_flags);
+	if (!tr->trace_buffer.buffer)
 		goto out_free_tr;
 
-	tr->data = alloc_percpu(struct trace_array_cpu);
-	if (!tr->data)
+	tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu);
+	if (!tr->trace_buffer.data)
 		goto out_free_tr;
 
 	for_each_tracing_cpu(i) {
-		memset(per_cpu_ptr(tr->data, i), 0, sizeof(struct trace_array_cpu));
-		per_cpu_ptr(tr->data, i)->trace_cpu.cpu = i;
-		per_cpu_ptr(tr->data, i)->trace_cpu.tr = tr;
+		memset(per_cpu_ptr(tr->trace_buffer.data, i), 0, sizeof(struct trace_array_cpu));
+		per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.cpu = i;
+		per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.tr = tr;
 	}
 
 	/* Holder for file callbacks */
@@ -5164,8 +5214,8 @@ static int new_instance_create(const char *name)
 	return 0;
 
  out_free_tr:
-	if (tr->buffer)
-		ring_buffer_free(tr->buffer);
+	if (tr->trace_buffer.buffer)
+		ring_buffer_free(tr->trace_buffer.buffer);
 	kfree(tr->name);
 	kfree(tr);
 
@@ -5198,8 +5248,8 @@ static int instance_delete(const char *name)
 
 	event_trace_del_tracer(tr);
 	debugfs_remove_recursive(tr->dir);
-	free_percpu(tr->data);
-	ring_buffer_free(tr->buffer);
+	free_percpu(tr->trace_buffer.data);
+	ring_buffer_free(tr->trace_buffer.buffer);
 
 	kfree(tr->name);
 	kfree(tr);
@@ -5439,6 +5489,7 @@ void trace_init_global_iter(struct trace_iterator *iter)
 	iter->tr = &global_trace;
 	iter->trace = iter->tr->current_trace;
 	iter->cpu_file = RING_BUFFER_ALL_CPUS;
+	iter->trace_buffer = &global_trace.trace_buffer;
 }
 
 static void
@@ -5476,7 +5527,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 	trace_init_global_iter(&iter);
 
 	for_each_tracing_cpu(cpu) {
-		atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
+		atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
 	}
 
 	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -5544,7 +5595,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 		trace_flags |= old_userobj;
 
 		for_each_tracing_cpu(cpu) {
-			atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
+			atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
 		}
 		tracing_on();
 	}
@@ -5594,58 +5645,59 @@ __init static int tracer_alloc_buffers(void)
 	raw_spin_lock_init(&global_trace.start_lock);
 
 	/* TODO: make the number of buffers hot pluggable with CPUS */
-	global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
-	if (!global_trace.buffer) {
+	global_trace.trace_buffer.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
+	if (!global_trace.trace_buffer.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
 		WARN_ON(1);
 		goto out_free_cpumask;
 	}
 
-	global_trace.data = alloc_percpu(struct trace_array_cpu);
+	global_trace.trace_buffer.data = alloc_percpu(struct trace_array_cpu);
 
-	if (!global_trace.data) {
+	if (!global_trace.trace_buffer.data) {
 		printk(KERN_ERR "tracer: failed to allocate percpu memory!\n");
 		WARN_ON(1);
 		goto out_free_cpumask;
 	}
 
 	for_each_tracing_cpu(i) {
-		memset(per_cpu_ptr(global_trace.data, i), 0, sizeof(struct trace_array_cpu));
-		per_cpu_ptr(global_trace.data, i)->trace_cpu.cpu = i;
-		per_cpu_ptr(global_trace.data, i)->trace_cpu.tr = &global_trace;
+		memset(per_cpu_ptr(global_trace.trace_buffer.data, i), 0,
+		       sizeof(struct trace_array_cpu));
+		per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.cpu = i;
+		per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.tr = &global_trace;
 	}
 
 	if (global_trace.buffer_disabled)
 		tracing_off();
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	max_tr.data = alloc_percpu(struct trace_array_cpu);
-	if (!max_tr.data) {
+	global_trace.max_buffer.data = alloc_percpu(struct trace_array_cpu);
+	if (!global_trace.max_buffer.data) {
 		printk(KERN_ERR "tracer: failed to allocate percpu memory!\n");
 		WARN_ON(1);
 		goto out_free_cpumask;
 	}
-	max_tr.buffer = ring_buffer_alloc(1, rb_flags);
-	raw_spin_lock_init(&max_tr.start_lock);
-	if (!max_tr.buffer) {
+	global_trace.max_buffer.buffer = ring_buffer_alloc(1, rb_flags);
+	if (!global_trace.max_buffer.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
 		WARN_ON(1);
-		ring_buffer_free(global_trace.buffer);
+		ring_buffer_free(global_trace.trace_buffer.buffer);
 		goto out_free_cpumask;
 	}
 
 	for_each_tracing_cpu(i) {
-		memset(per_cpu_ptr(max_tr.data, i), 0, sizeof(struct trace_array_cpu));
-		per_cpu_ptr(max_tr.data, i)->trace_cpu.cpu = i;
-		per_cpu_ptr(max_tr.data, i)->trace_cpu.tr = &max_tr;
+		memset(per_cpu_ptr(global_trace.max_buffer.data, i), 0,
+		       sizeof(struct trace_array_cpu));
+		per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.cpu = i;
+		per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.tr = &global_trace;
 	}
 #endif
 
 	/* Allocate the first page for all buffers */
-	set_buffer_entries(&global_trace,
-			   ring_buffer_size(global_trace.buffer, 0));
+	set_buffer_entries(&global_trace.trace_buffer,
+			   ring_buffer_size(global_trace.trace_buffer.buffer, 0));
 #ifdef CONFIG_TRACER_MAX_TRACE
-	set_buffer_entries(&max_tr, 1);
+	set_buffer_entries(&global_trace.max_buffer, 1);
 #endif
 
 	trace_init_cmdlines();
@@ -5682,8 +5734,10 @@ __init static int tracer_alloc_buffers(void)
 	return 0;
 
 out_free_cpumask:
-	free_percpu(global_trace.data);
-	free_percpu(max_tr.data);
+	free_percpu(global_trace.trace_buffer.data);
+#ifdef CONFIG_TRACER_MAX_TRACE
+	free_percpu(global_trace.max_buffer.data);
+#endif
 	free_cpumask_var(tracing_cpumask);
 out_free_buffer_mask:
 	free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fa60b2977524..986834f1f4dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -167,16 +167,37 @@ struct trace_array_cpu {
 
 struct tracer;
 
+struct trace_buffer {
+	struct trace_array		*tr;
+	struct ring_buffer		*buffer;
+	struct trace_array_cpu __percpu	*data;
+	cycle_t				time_start;
+	int				cpu;
+};
+
 /*
  * The trace array - an array of per-CPU trace arrays. This is the
  * highest level data structure that individual tracers deal with.
  * They have on/off state as well:
  */
 struct trace_array {
-	struct ring_buffer	*buffer;
 	struct list_head	list;
 	char			*name;
-	int			cpu;
+	struct trace_buffer	trace_buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+	/*
+	 * The max_buffer is used to snapshot the trace when a maximum
+	 * latency is reached, or when the user initiates a snapshot.
+	 * Some tracers will use this to store a maximum trace while
+	 * it continues examining live traces.
+	 *
+	 * The buffers for the max_buffer are set up the same as the trace_buffer
+	 * When a snapshot is taken, the buffer of the max_buffer is swapped
+	 * with the buffer of the trace_buffer and the buffers are reset for
+	 * the trace_buffer so the tracing can continue.
+	 */
+	struct trace_buffer	max_buffer;
+#endif
 	int			buffer_disabled;
 	struct trace_cpu	trace_cpu;	/* place holder */
 #ifdef CONFIG_FTRACE_SYSCALLS
@@ -189,7 +210,6 @@ struct trace_array {
 	int			clock_id;
 	struct tracer		*current_trace;
 	unsigned int		flags;
-	cycle_t			time_start;
 	raw_spinlock_t		start_lock;
 	struct dentry		*dir;
 	struct dentry		*options;
@@ -198,7 +218,6 @@ struct trace_array {
 	struct list_head	systems;
 	struct list_head	events;
 	struct task_struct	*waiter;
-	struct trace_array_cpu __percpu	*data;
 };
 
 enum {
@@ -345,9 +364,11 @@ struct tracer {
 	struct tracer		*next;
 	struct tracer_flags	*flags;
 	bool			print_max;
+	bool			enabled;
+#ifdef CONFIG_TRACER_MAX_TRACE
 	bool			use_max_tr;
 	bool			allocated_snapshot;
-	bool			enabled;
+#endif
 };
 
 
@@ -493,8 +514,8 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
 
 int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
-void tracing_reset(struct trace_array *tr, int cpu);
-void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset(struct trace_buffer *buf, int cpu);
+void tracing_reset_online_cpus(struct trace_buffer *buf);
 void tracing_reset_current(int cpu);
 void tracing_reset_all_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -674,6 +695,8 @@ trace_array_vprintk(struct trace_array *tr,
 		    unsigned long ip, const char *fmt, va_list args);
 int trace_array_printk(struct trace_array *tr,
 		       unsigned long ip, const char *fmt, ...);
+int trace_array_printk_buf(struct ring_buffer *buffer,
+			   unsigned long ip, const char *fmt, ...);
 void trace_printk_seq(struct trace_seq *s);
 enum print_line_t print_trace_line(struct trace_iterator *iter);
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9d73861efc6a..e467c0c7bdd5 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);
 static int function_trace_init(struct trace_array *tr)
 {
 	func_trace = tr;
-	tr->cpu = get_cpu();
+	tr->trace_buffer.cpu = get_cpu();
 	put_cpu();
 
 	tracing_start_cmdline_record();
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)
 
 static void function_trace_start(struct trace_array *tr)
 {
-	tracing_reset_online_cpus(tr);
+	tracing_reset_online_cpus(&tr->trace_buffer);
 }
 
 /* Our option */
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 		goto out;
 
 	cpu = smp_processor_id();
-	data = per_cpu_ptr(tr->data, cpu);
+	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 	if (!atomic_read(&data->disabled)) {
 		local_save_flags(flags);
 		trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	 */
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = per_cpu_ptr(tr->data, cpu);
+	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1)) {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ca986d61a282..8388bc99f2ee 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,
 {
 	struct ftrace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ftrace_graph_ent_entry *entry;
 
 	if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = per_cpu_ptr(tr->data, cpu);
+	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,
 {
 	struct ftrace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ftrace_graph_ret_entry *entry;
 
 	if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = per_cpu_ptr(tr->data, cpu);
+	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,
 			 * We need to consume the current entry to see
 			 * the next one.
 			 */
-			ring_buffer_consume(iter->tr->buffer, iter->cpu,
+			ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
 					    NULL, NULL);
-			event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+			event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
 						 NULL, NULL);
 		}
 
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9b52f9cf7a0d..5aa40ab72b57 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -121,7 +121,7 @@ static int func_prolog_dec(struct trace_array *tr,
 	if (!irqs_disabled_flags(*flags))
 		return 0;
 
-	*data = per_cpu_ptr(tr->data, cpu);
+	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 	disabled = atomic_inc_return(&(*data)->disabled);
 
 	if (likely(disabled == 1))
@@ -175,7 +175,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
 		per_cpu(tracing_cpu, cpu) = 0;
 
 	tracing_max_latency = 0;
-	tracing_reset_online_cpus(irqsoff_trace);
+	tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
 
 	return start_irqsoff_tracer(irqsoff_trace, set);
 }
@@ -380,7 +380,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	if (per_cpu(tracing_cpu, cpu))
 		return;
 
-	data = per_cpu_ptr(tr->data, cpu);
+	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 
 	if (unlikely(!data) || atomic_read(&data->disabled))
 		return;
@@ -418,7 +418,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	if (!tracer_enabled)
 		return;
 
-	data = per_cpu_ptr(tr->data, cpu);
+	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 
 	if (unlikely(!data) ||
 	    !data->critical_start || atomic_read(&data->disabled))
@@ -568,7 +568,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visible */
 	smp_wmb();
-	tracing_reset_online_cpus(tr);
+	tracing_reset_online_cpus(&tr->trace_buffer);
 
 	if (start_irqsoff_tracer(tr, is_graph()))
 		printk(KERN_ERR "failed to start irqsoff tracer\n");
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 349f6941e8f2..bd90e1b06088 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
 	trace_init_global_iter(&iter);
 
 	for_each_tracing_cpu(cpu) {
-		atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
+		atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
 	}
 
 	old_userobj = trace_flags;
@@ -46,14 +46,14 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.tr->buffer, cpu);
+			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
 			ring_buffer_read_start(iter.buffer_iter[cpu]);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
+			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
 		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
@@ -83,7 +83,7 @@ out:
 	trace_flags = old_userobj;
 
 	for_each_tracing_cpu(cpu) {
-		atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled);
+		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
 	}
 
 	for_each_tracing_cpu(cpu)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 2472f6f76b50..a5e8f4878bfa 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
 	overrun_detected = false;
 	prev_overruns = 0;
 
-	tracing_reset_online_cpus(tr);
+	tracing_reset_online_cpus(&tr->trace_buffer);
 }
 
 static int mmio_trace_init(struct trace_array *tr)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
 static unsigned long count_overruns(struct trace_iterator *iter)
 {
 	unsigned long cnt = atomic_xchg(&dropped_count, 0);
-	unsigned long over = ring_buffer_overruns(iter->tr->buffer);
+	unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
 
 	if (over > prev_overruns)
 		cnt += over - prev_overruns;
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 				struct mmiotrace_rw *rw)
 {
 	struct ftrace_event_call *call = &event_mmiotrace_rw;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
 	int pc = preempt_count();
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 void mmio_trace_rw(struct mmiotrace_rw *rw)
 {
 	struct trace_array *tr = mmio_trace_array;
-	struct trace_array_cpu *data = per_cpu_ptr(tr->data, smp_processor_id());
+	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
 	__trace_mmiotrace_rw(tr, data, rw);
 }
 
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 				struct mmiotrace_map *map)
 {
 	struct ftrace_event_call *call = &event_mmiotrace_map;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
 	int pc = preempt_count();
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
 	struct trace_array_cpu *data;
 
 	preempt_disable();
-	data = per_cpu_ptr(tr->data, smp_processor_id());
+	data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
 	__trace_mmiotrace_map(tr, data, map);
 	preempt_enable();
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index aa92ac322ba2..2edc7220d017 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -643,7 +643,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
 {
 	unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
 	unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
-	unsigned long long abs_ts = iter->ts - iter->tr->time_start;
+	unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
 	unsigned long long rel_ts = next_ts - iter->ts;
 	struct trace_seq *s = &iter->seq;
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 1ffe39abd6fc..4e98e3b257a3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   unsigned long flags, int pc)
 {
 	struct ftrace_event_call *call = &event_context_switch;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
 	pc = preempt_count();
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = per_cpu_ptr(ctx_trace->data, cpu);
+	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
 
 	if (likely(!atomic_read(&data->disabled)))
 		tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	struct ftrace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
-	struct ring_buffer *buffer = tr->buffer;
+	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
 					  sizeof(*entry), flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
 	pc = preempt_count();
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = per_cpu_ptr(ctx_trace->data, cpu);
+	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
 
 	if (likely(!atomic_read(&data->disabled)))
 		tracing_sched_wakeup_trace(ctx_trace, wakee, current,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f9ceb75a95b7..c16f8cd63c3c 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -89,7 +89,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
 	if (cpu != wakeup_current_cpu)
 		goto out_enable;
 
-	*data = per_cpu_ptr(tr->data, cpu);
+	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 	disabled = atomic_inc_return(&(*data)->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
@@ -353,7 +353,7 @@ probe_wakeup_sched_switch(void *ignore,
 
 	/* disable local data, not wakeup_cpu data */
 	cpu = raw_smp_processor_id();
-	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled);
+	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
 	if (likely(disabled != 1))
 		goto out;
 
@@ -365,7 +365,7 @@ probe_wakeup_sched_switch(void *ignore,
 		goto out_unlock;
 
 	/* The task we are waiting for is waking up */
-	data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu);
+	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
 
 	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +387,7 @@ out_unlock:
 	arch_spin_unlock(&wakeup_lock);
 	local_irq_restore(flags);
 out:
-	atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled);
+	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
 }
 
 static void __wakeup_reset(struct trace_array *tr)
@@ -405,7 +405,7 @@ static void wakeup_reset(struct trace_array *tr)
 {
 	unsigned long flags;
 
-	tracing_reset_online_cpus(tr);
+	tracing_reset_online_cpus(&tr->trace_buffer);
 
 	local_irq_save(flags);
 	arch_spin_lock(&wakeup_lock);
@@ -435,7 +435,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 		return;
 
 	pc = preempt_count();
-	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled);
+	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
 
@@ -458,7 +458,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 
 	local_save_flags(flags);
 
-	data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu);
+	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
 	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
 
@@ -472,7 +472,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 out_locked:
 	arch_spin_unlock(&wakeup_lock);
 out:
-	atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled);
+	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
 }
 
 static void start_wakeup_tracer(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 51c819c12c29..8672c40cb153 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	return 0;
 }
 
-static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
+static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
 {
 	struct ring_buffer_event *event;
 	struct trace_entry *entry;
 	unsigned int loops = 0;
 
-	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
+	while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
 		entry = ring_buffer_event_data(event);
 
 		/*
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
  * Test the trace buffer to see if all the elements
  * are still sane.
  */
-static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
 {
 	unsigned long flags, cnt = 0;
 	int cpu, ret = 0;
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 	local_irq_save(flags);
 	arch_spin_lock(&ftrace_max_lock);
 
-	cnt = ring_buffer_entries(tr->buffer);
+	cnt = ring_buffer_entries(buf->buffer);
 
 	/*
 	 * The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 	 */
 	tracing_off();
 	for_each_possible_cpu(cpu) {
-		ret = trace_test_buffer_cpu(tr, cpu);
+		ret = trace_test_buffer_cpu(buf, cpu);
 		if (ret)
 			break;
 	}
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	msleep(100);
 
 	/* we should have nothing in the buffer */
-	ret = trace_test_buffer(tr, &count);
+	ret = trace_test_buffer(&tr->trace_buffer, &count);
 	if (ret)
 		goto out;
 
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	ftrace_enabled = 0;
 
 	/* check the trace buffer */
-	ret = trace_test_buffer(tr, &count);
+	ret = trace_test_buffer(&tr->trace_buffer, &count);
 	tracing_start();
 
 	/* we should only have one item */
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	ftrace_enabled = 0;
 
 	/* check the trace buffer */
-	ret = trace_test_buffer(tr, &count);
+	ret = trace_test_buffer(&tr->trace_buffer, &count);
 	trace->reset(tr);
 	tracing_start();
 
@@ -737,7 +737,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 	 * Simulate the init() callback but we attach a watchdog callback
 	 * to detect and recover from possible hangs
 	 */
-	tracing_reset_online_cpus(tr);
+	tracing_reset_online_cpus(&tr->trace_buffer);
 	set_graph_array(tr);
 	ret = register_ftrace_graph(&trace_graph_return,
 				    &trace_graph_entry_watchdog);
@@ -760,7 +760,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 	tracing_stop();
 
 	/* check the trace buffer */
-	ret = trace_test_buffer(tr, &count);
+	ret = trace_test_buffer(&tr->trace_buffer, &count);
 
 	trace->reset(tr);
 	tracing_start();
@@ -815,9 +815,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
-	ret = trace_test_buffer(tr, NULL);
+	ret = trace_test_buffer(&tr->trace_buffer, NULL);
 	if (!ret)
-		ret = trace_test_buffer(&max_tr, &count);
+		ret = trace_test_buffer(&tr->max_buffer, &count);
 	trace->reset(tr);
 	tracing_start();
 
@@ -877,9 +877,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
-	ret = trace_test_buffer(tr, NULL);
+	ret = trace_test_buffer(&tr->trace_buffer, NULL);
 	if (!ret)
-		ret = trace_test_buffer(&max_tr, &count);
+		ret = trace_test_buffer(&tr->max_buffer, &count);
 	trace->reset(tr);
 	tracing_start();
 
@@ -943,11 +943,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
-	ret = trace_test_buffer(tr, NULL);
+	ret = trace_test_buffer(&tr->trace_buffer, NULL);
 	if (ret)
 		goto out;
 
-	ret = trace_test_buffer(&max_tr, &count);
+	ret = trace_test_buffer(&tr->max_buffer, &count);
 	if (ret)
 		goto out;
 
@@ -973,11 +973,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
-	ret = trace_test_buffer(tr, NULL);
+	ret = trace_test_buffer(&tr->trace_buffer, NULL);
 	if (ret)
 		goto out;
 
-	ret = trace_test_buffer(&max_tr, &count);
+	ret = trace_test_buffer(&tr->max_buffer, &count);
 
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
@@ -1084,10 +1084,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
-	ret = trace_test_buffer(tr, NULL);
+	ret = trace_test_buffer(&tr->trace_buffer, NULL);
 	printk("ret = %d\n", ret);
 	if (!ret)
-		ret = trace_test_buffer(&max_tr, &count);
+		ret = trace_test_buffer(&tr->max_buffer, &count);
 
 
 	trace->reset(tr);
@@ -1126,7 +1126,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 	/* stop the tracing. */
 	tracing_stop();
 	/* check the trace buffer */
-	ret = trace_test_buffer(tr, &count);
+	ret = trace_test_buffer(&tr->trace_buffer, &count);
 	trace->reset(tr);
 	tracing_start();
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 1cd37ffb4093..68f3f344be65 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
-	buffer = tr->buffer;
+	buffer = tr->trace_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer,
 			sys_data->enter_event->event.type, size, 0, 0);
 	if (!event)
@@ -355,7 +355,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	buffer = tr->buffer;
+	buffer = tr->trace_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer,
 			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
 	if (!event)
-- 
cgit v1.2.3


From f1affcaaa861f27752a769f889bf1486ebd301fe Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 14:35:11 -0500
Subject: tracing: Add snapshot in the per_cpu trace directories

Add the snapshot file into the per_cpu tracing directories to allow
them to be read for an individual cpu. This also allows to clear
an individual cpu from the snapshot buffer.

If the kernel allows it (CONFIG_RING_BUFFER_ALLOW_SWAP is set), then
echoing in '1' into one of the per_cpu snapshot files will do an
individual cpu buffer swap instead of the entire file.

Cc: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 66 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 10 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a08c127db865..303932688964 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2436,6 +2436,31 @@ static void test_ftrace_alive(struct seq_file *m)
 }
 
 #ifdef CONFIG_TRACER_MAX_TRACE
+static void show_snapshot_main_help(struct seq_file *m)
+{
+	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
+	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
+	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n");
+	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
+	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n");
+	seq_printf(m, "#                       is not a '0' or '1')\n");
+}
+
+static void show_snapshot_percpu_help(struct seq_file *m)
+{
+	seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
+	seq_printf(m, "#                      Takes a snapshot of the main buffer for this cpu.\n");
+#else
+	seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
+	seq_printf(m, "#                     Must use main snapshot file to allocate.\n");
+#endif
+	seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
+	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n");
+	seq_printf(m, "#                       is not a '0' or '1')\n");
+}
+
 static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
 {
 	if (iter->trace->allocated_snapshot)
@@ -2444,12 +2469,10 @@ static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
 		seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
 
 	seq_printf(m, "# Snapshot commands:\n");
-	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
-	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
-	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n");
-	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
-	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n");
-	seq_printf(m, "#                       is not a '0' or '1')\n");
+	if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+		show_snapshot_main_help(m);
+	else
+		show_snapshot_percpu_help(m);
 }
 #else
 /* Should never be called */
@@ -4207,6 +4230,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
 		}
 		iter->tr = tc->tr;
 		iter->trace_buffer = &tc->tr->max_buffer;
+		iter->cpu_file = tc->cpu;
 		m->private = iter;
 		file->private_data = m;
 	}
@@ -4241,6 +4265,10 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	switch (val) {
 	case 0:
+		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+			ret = -EINVAL;
+			break;
+		}
 		if (tr->current_trace->allocated_snapshot) {
 			/* free spare buffer */
 			ring_buffer_resize(tr->max_buffer.buffer, 1,
@@ -4251,6 +4279,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		}
 		break;
 	case 1:
+/* Only allow per-cpu swap if the ring buffer supports it */
+#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
+		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+			ret = -EINVAL;
+			break;
+		}
+#endif
 		if (!tr->current_trace->allocated_snapshot) {
 			/* allocate spare buffer */
 			ret = resize_buffer_duplicate_size(&tr->max_buffer,
@@ -4259,15 +4294,21 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 				break;
 			tr->current_trace->allocated_snapshot = true;
 		}
-
 		local_irq_disable();
 		/* Now, we're going to swap */
-		update_max_tr(&global_trace, current, smp_processor_id());
+		if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+			update_max_tr(&global_trace, current, smp_processor_id());
+		else
+			update_max_tr_single(&global_trace, current, iter->cpu_file);
 		local_irq_enable();
 		break;
 	default:
-		if (tr->current_trace->allocated_snapshot)
-			tracing_reset_online_cpus(&tr->max_buffer);
+		if (tr->current_trace->allocated_snapshot) {
+			if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+				tracing_reset_online_cpus(&tr->max_buffer);
+			else
+				tracing_reset(&tr->max_buffer, iter->cpu_file);
+		}
 		break;
 	}
 
@@ -4835,6 +4876,11 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
 
 	trace_create_file("buffer_size_kb", 0444, d_cpu,
 			(void *)&data->trace_cpu, &tracing_entries_fops);
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+	trace_create_file("snapshot", 0644, d_cpu,
+			  (void *)&data->trace_cpu, &snapshot_fops);
+#endif
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
-- 
cgit v1.2.3


From 0b85ffc293044393623059eda9904a7d5b644e36 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 14:50:23 -0500
Subject: tracing: Add config option to allow snapshot to swap per cpu

When the preempt or irq latency tracers are enabled, they require
the ring buffer to be able to swap the per cpu sub buffers between
two main buffers. This adds a slight overhead to tracing as the
trace recording needs to perform some checks to synchronize
between recording and swaps that might be happening on other CPUs.

The config RING_BUFFER_ALLOW_SWAP is set when a user of the ring
buffer needs the "swap cpu" feature, otherwise the extra checks
are not implemented and removed from the tracing overhead.

The snapshot feature will swap per CPU if the RING_BUFFER_ALLOW_SWAP
config is set. But that only gets set by things like OPROFILE
and the irqs and preempt latency tracers.

This config is added to let the user decide to include this feature
with the snapshot agnostic from whether or not another user of
the ring buffer sets this config.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 590a27fc212f..f78eab251897 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -192,6 +192,7 @@ config IRQSOFF_TRACER
 	select TRACER_MAX_TRACE
 	select RING_BUFFER_ALLOW_SWAP
 	select TRACER_SNAPSHOT
+	select TRACER_SNAPSHOT_PER_CPU_SWAP
 	help
 	  This option measures the time spent in irqs-off critical
 	  sections, with microsecond accuracy.
@@ -215,6 +216,7 @@ config PREEMPT_TRACER
 	select TRACER_MAX_TRACE
 	select RING_BUFFER_ALLOW_SWAP
 	select TRACER_SNAPSHOT
+	select TRACER_SNAPSHOT_PER_CPU_SWAP
 	help
 	  This option measures the time spent in preemption-off critical
 	  sections, with microsecond accuracy.
@@ -266,6 +268,27 @@ config TRACER_SNAPSHOT
 	      echo 1 > /sys/kernel/debug/tracing/snapshot
 	      cat snapshot
 
+config TRACER_SNAPSHOT_PER_CPU_SWAP
+        bool "Allow snapshot to swap per CPU"
+	depends on TRACER_SNAPSHOT
+	select RING_BUFFER_ALLOW_SWAP
+	help
+	  Allow doing a snapshot of a single CPU buffer instead of a
+	  full swap (all buffers). If this is set, then the following is
+	  allowed:
+
+	      echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+
+	  After which, only the tracing buffer for CPU 2 was swapped with
+	  the main tracing buffer, and the other CPU buffers remain the same.
+
+	  When this is enabled, this adds a little more overhead to the
+	  trace recording, as it needs to add some checks to synchronize
+	  recording with swaps. But this does not affect the performance
+	  of the overall system. This is enabled by default when the preempt
+	  or irq latency tracers are enabled, as those need to swap as well
+	  and already adds the overhead (plus a lot more).
+
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
-- 
cgit v1.2.3


From 6de58e6269cd0568ca5fbae14423914eff0f7811 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 16:18:16 -0500
Subject: tracing: Add snapshot_raw to extract the raw data from snapshot

Add a 'snapshot_raw' per_cpu file that allows tools to read the raw
binary data of the snapshot buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 113 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 95 insertions(+), 18 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 303932688964..9bb0b52cbd32 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4206,6 +4206,12 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
 	return single_open(file, tracing_clock_show, inode->i_private);
 }
 
+struct ftrace_buffer_info {
+	struct trace_iterator	iter;
+	void			*spare;
+	unsigned int		read;
+};
+
 #ifdef CONFIG_TRACER_SNAPSHOT
 static int tracing_snapshot_open(struct inode *inode, struct file *file)
 {
@@ -4336,6 +4342,35 @@ static int tracing_snapshot_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int tracing_buffers_open(struct inode *inode, struct file *filp);
+static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+				    size_t count, loff_t *ppos);
+static int tracing_buffers_release(struct inode *inode, struct file *file);
+static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+		   struct pipe_inode_info *pipe, size_t len, unsigned int flags);
+
+static int snapshot_raw_open(struct inode *inode, struct file *filp)
+{
+	struct ftrace_buffer_info *info;
+	int ret;
+
+	ret = tracing_buffers_open(inode, filp);
+	if (ret < 0)
+		return ret;
+
+	info = filp->private_data;
+
+	if (info->iter.trace->use_max_tr) {
+		tracing_buffers_release(inode, filp);
+		return -EBUSY;
+	}
+
+	info->iter.snapshot = true;
+	info->iter.trace_buffer = &info->iter.tr->max_buffer;
+
+	return ret;
+}
+
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
 
@@ -4402,14 +4437,17 @@ static const struct file_operations snapshot_fops = {
 	.llseek		= tracing_seek,
 	.release	= tracing_snapshot_release,
 };
-#endif /* CONFIG_TRACER_SNAPSHOT */
 
-struct ftrace_buffer_info {
-	struct trace_iterator	iter;
-	void			*spare;
-	unsigned int		read;
+static const struct file_operations snapshot_raw_fops = {
+	.open		= snapshot_raw_open,
+	.read		= tracing_buffers_read,
+	.release	= tracing_buffers_release,
+	.splice_read	= tracing_buffers_splice_read,
+	.llseek		= no_llseek,
 };
 
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
 static int tracing_buffers_open(struct inode *inode, struct file *filp)
 {
 	struct trace_cpu *tc = inode->i_private;
@@ -4452,16 +4490,26 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	struct ftrace_buffer_info *info = filp->private_data;
 	struct trace_iterator *iter = &info->iter;
 	ssize_t ret;
-	size_t size;
+	ssize_t size;
 
 	if (!count)
 		return 0;
 
+	mutex_lock(&trace_types_lock);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
+		size = -EBUSY;
+		goto out_unlock;
+	}
+#endif
+
 	if (!info->spare)
 		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
 							  iter->cpu_file);
+	size = -ENOMEM;
 	if (!info->spare)
-		return -ENOMEM;
+		goto out_unlock;
 
 	/* Do we have previous read data to read? */
 	if (info->read < PAGE_SIZE)
@@ -4477,31 +4525,42 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
 	if (ret < 0) {
 		if (trace_empty(iter)) {
-			if ((filp->f_flags & O_NONBLOCK))
-				return -EAGAIN;
+			if ((filp->f_flags & O_NONBLOCK)) {
+				size = -EAGAIN;
+				goto out_unlock;
+			}
+			mutex_unlock(&trace_types_lock);
 			iter->trace->wait_pipe(iter);
-			if (signal_pending(current))
-				return -EINTR;
+			mutex_lock(&trace_types_lock);
+			if (signal_pending(current)) {
+				size = -EINTR;
+				goto out_unlock;
+			}
 			goto again;
 		}
-		return 0;
+		size = 0;
+		goto out_unlock;
 	}
 
 	info->read = 0;
-
  read:
 	size = PAGE_SIZE - info->read;
 	if (size > count)
 		size = count;
 
 	ret = copy_to_user(ubuf, info->spare + info->read, size);
-	if (ret == size)
-		return -EFAULT;
+	if (ret == size) {
+		size = -EFAULT;
+		goto out_unlock;
+	}
 	size -= ret;
 
 	*ppos += size;
 	info->read += size;
 
+ out_unlock:
+	mutex_unlock(&trace_types_lock);
+
 	return size;
 }
 
@@ -4591,10 +4650,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	};
 	struct buffer_ref *ref;
 	int entries, size, i;
-	size_t ret;
+	ssize_t ret;
 
-	if (splice_grow_spd(pipe, &spd))
-		return -ENOMEM;
+	mutex_lock(&trace_types_lock);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
+		ret = -EBUSY;
+		goto out;
+	}
+#endif
+
+	if (splice_grow_spd(pipe, &spd)) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	if (*ppos & (PAGE_SIZE - 1)) {
 		ret = -EINVAL;
@@ -4666,7 +4736,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			ret = -EAGAIN;
 			goto out;
 		}
+		mutex_unlock(&trace_types_lock);
 		iter->trace->wait_pipe(iter);
+		mutex_lock(&trace_types_lock);
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
@@ -4677,6 +4749,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	ret = splice_to_pipe(pipe, &spd);
 	splice_shrink_spd(&spd);
 out:
+	mutex_unlock(&trace_types_lock);
+
 	return ret;
 }
 
@@ -4880,6 +4954,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
 #ifdef CONFIG_TRACER_SNAPSHOT
 	trace_create_file("snapshot", 0644, d_cpu,
 			  (void *)&data->trace_cpu, &snapshot_fops);
+
+	trace_create_file("snapshot_raw", 0444, d_cpu,
+			(void *)&data->trace_cpu, &snapshot_raw_fops);
 #endif
 }
 
-- 
cgit v1.2.3


From 45ad21ca5530efdca6a19e4a5ac5e7bd6e24f996 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 18:25:02 -0500
Subject: tracing: Have trace_array keep track if snapshot buffer is allocated

The snapshot buffer belongs to the trace array not the tracer that is
running. The trace array should be the data structure that keeps track
of whether or not the snapshot buffer is allocated, not the tracer
desciptor. Having the trace array keep track of it makes modifications
so much easier.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 32 +++++++++++++++-----------------
 kernel/trace/trace.h |  2 +-
 2 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9bb0b52cbd32..bcc9460c2d65 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -667,7 +667,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	WARN_ON_ONCE(!irqs_disabled());
 
-	if (!tr->current_trace->allocated_snapshot) {
+	if (!tr->allocated_snapshot) {
 		/* Only the nop tracer should hit this when disabling */
 		WARN_ON_ONCE(tr->current_trace != &nop_trace);
 		return;
@@ -700,7 +700,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	if (WARN_ON_ONCE(!tr->current_trace->allocated_snapshot))
+	if (WARN_ON_ONCE(!tr->allocated_snapshot))
 		return;
 
 	arch_spin_lock(&ftrace_max_lock);
@@ -802,7 +802,7 @@ int register_tracer(struct tracer *type)
 			if (ring_buffer_expanded)
 				ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
 						   RING_BUFFER_ALL_CPUS);
-			type->allocated_snapshot = true;
+			tr->allocated_snapshot = true;
 		}
 #endif
 
@@ -822,7 +822,7 @@ int register_tracer(struct tracer *type)
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 		if (type->use_max_tr) {
-			type->allocated_snapshot = false;
+			tr->allocated_snapshot = false;
 
 			/* Shrink the max buffer again */
 			if (ring_buffer_expanded)
@@ -2463,7 +2463,7 @@ static void show_snapshot_percpu_help(struct seq_file *m)
 
 static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
 {
-	if (iter->trace->allocated_snapshot)
+	if (iter->tr->allocated_snapshot)
 		seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
 	else
 		seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
@@ -3364,12 +3364,12 @@ static int tracing_set_tracer(const char *buf)
 	if (tr->current_trace->reset)
 		tr->current_trace->reset(tr);
 
-#ifdef CONFIG_TRACER_MAX_TRACE
-	had_max_tr = tr->current_trace->allocated_snapshot;
-
 	/* Current trace needs to be nop_trace before synchronize_sched */
 	tr->current_trace = &nop_trace;
 
+#ifdef CONFIG_TRACER_MAX_TRACE
+	had_max_tr = tr->allocated_snapshot;
+
 	if (had_max_tr && !t->use_max_tr) {
 		/*
 		 * We need to make sure that the update_max_tr sees that
@@ -3387,10 +3387,8 @@ static int tracing_set_tracer(const char *buf)
 		ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
 		set_buffer_entries(&tr->max_buffer, 1);
 		tracing_reset_online_cpus(&tr->max_buffer);
-		tr->current_trace->allocated_snapshot = false;
+		tr->allocated_snapshot = false;
 	}
-#else
-	tr->current_trace = &nop_trace;
 #endif
 	destroy_trace_option_files(topts);
 
@@ -3403,7 +3401,7 @@ static int tracing_set_tracer(const char *buf)
 						   RING_BUFFER_ALL_CPUS);
 		if (ret < 0)
 			goto out;
-		t->allocated_snapshot = true;
+		tr->allocated_snapshot = true;
 	}
 #endif
 
@@ -4275,13 +4273,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 			ret = -EINVAL;
 			break;
 		}
-		if (tr->current_trace->allocated_snapshot) {
+		if (tr->allocated_snapshot) {
 			/* free spare buffer */
 			ring_buffer_resize(tr->max_buffer.buffer, 1,
 					   RING_BUFFER_ALL_CPUS);
 			set_buffer_entries(&tr->max_buffer, 1);
 			tracing_reset_online_cpus(&tr->max_buffer);
-			tr->current_trace->allocated_snapshot = false;
+			tr->allocated_snapshot = false;
 		}
 		break;
 	case 1:
@@ -4292,13 +4290,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 			break;
 		}
 #endif
-		if (!tr->current_trace->allocated_snapshot) {
+		if (!tr->allocated_snapshot) {
 			/* allocate spare buffer */
 			ret = resize_buffer_duplicate_size(&tr->max_buffer,
 					&tr->trace_buffer, RING_BUFFER_ALL_CPUS);
 			if (ret < 0)
 				break;
-			tr->current_trace->allocated_snapshot = true;
+			tr->allocated_snapshot = true;
 		}
 		local_irq_disable();
 		/* Now, we're going to swap */
@@ -4309,7 +4307,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		local_irq_enable();
 		break;
 	default:
-		if (tr->current_trace->allocated_snapshot) {
+		if (tr->allocated_snapshot) {
 			if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
 				tracing_reset_online_cpus(&tr->max_buffer);
 			else
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 986834f1f4dd..1a456c291a07 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -197,6 +197,7 @@ struct trace_array {
 	 * the trace_buffer so the tracing can continue.
 	 */
 	struct trace_buffer	max_buffer;
+	bool			allocated_snapshot;
 #endif
 	int			buffer_disabled;
 	struct trace_cpu	trace_cpu;	/* place holder */
@@ -367,7 +368,6 @@ struct tracer {
 	bool			enabled;
 #ifdef CONFIG_TRACER_MAX_TRACE
 	bool			use_max_tr;
-	bool			allocated_snapshot;
 #endif
 };
 
-- 
cgit v1.2.3


From 737223fbca3b1c91feb947c7f571b35749b743b6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 21:13:47 -0500
Subject: tracing: Consolidate buffer allocation code

There's a bit of duplicate code in creating the trace buffers for
the normal trace buffer and the max trace buffer among the instances
and the main global_trace. This code can be consolidated and cleaned
up a bit making the code cleaner and more readable as well as less
duplication.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 130 +++++++++++++++++++++++++--------------------------
 1 file changed, 63 insertions(+), 67 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bcc9460c2d65..57895d476509 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3171,6 +3171,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
 static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
 {
 	int cpu;
+
 	for_each_tracing_cpu(cpu)
 		per_cpu_ptr(buf->data, cpu)->entries = val;
 }
@@ -5267,12 +5268,70 @@ struct dentry *trace_instance_dir;
 static void
 init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
 
-static int new_instance_create(const char *name)
+static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
+{
+	int cpu;
+
+	for_each_tracing_cpu(cpu) {
+		memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
+		per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
+		per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
+	}
+}
+
+static int allocate_trace_buffers(struct trace_array *tr, int size)
 {
 	enum ring_buffer_flags rb_flags;
+
+	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+
+	tr->trace_buffer.buffer = ring_buffer_alloc(size, rb_flags);
+	if (!tr->trace_buffer.buffer)
+		goto out_free;
+
+	tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu);
+	if (!tr->trace_buffer.data)
+		goto out_free;
+
+	init_trace_buffers(tr, &tr->trace_buffer);
+
+	/* Allocate the first page for all buffers */
+	set_buffer_entries(&tr->trace_buffer,
+			   ring_buffer_size(tr->trace_buffer.buffer, 0));
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+
+	tr->max_buffer.buffer = ring_buffer_alloc(1, rb_flags);
+	if (!tr->max_buffer.buffer)
+		goto out_free;
+
+	tr->max_buffer.data = alloc_percpu(struct trace_array_cpu);
+	if (!tr->max_buffer.data)
+		goto out_free;
+
+	init_trace_buffers(tr, &tr->max_buffer);
+
+	set_buffer_entries(&tr->max_buffer, 1);
+#endif
+	return 0;
+
+ out_free:
+	if (tr->trace_buffer.buffer)
+		ring_buffer_free(tr->trace_buffer.buffer);
+	free_percpu(tr->trace_buffer.data);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (tr->max_buffer.buffer)
+		ring_buffer_free(tr->max_buffer.buffer);
+	free_percpu(tr->max_buffer.data);
+#endif
+	return -ENOMEM;
+}
+
+static int new_instance_create(const char *name)
+{
 	struct trace_array *tr;
 	int ret;
-	int i;
 
 	mutex_lock(&trace_types_lock);
 
@@ -5298,22 +5357,9 @@ static int new_instance_create(const char *name)
 	INIT_LIST_HEAD(&tr->systems);
 	INIT_LIST_HEAD(&tr->events);
 
-	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
-
-	tr->trace_buffer.buffer = ring_buffer_alloc(trace_buf_size, rb_flags);
-	if (!tr->trace_buffer.buffer)
-		goto out_free_tr;
-
-	tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu);
-	if (!tr->trace_buffer.data)
+	if (allocate_trace_buffers(tr, trace_buf_size) < 0)
 		goto out_free_tr;
 
-	for_each_tracing_cpu(i) {
-		memset(per_cpu_ptr(tr->trace_buffer.data, i), 0, sizeof(struct trace_array_cpu));
-		per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.cpu = i;
-		per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.tr = tr;
-	}
-
 	/* Holder for file callbacks */
 	tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
 	tr->trace_cpu.tr = tr;
@@ -5736,8 +5782,6 @@ EXPORT_SYMBOL_GPL(ftrace_dump);
 __init static int tracer_alloc_buffers(void)
 {
 	int ring_buf_size;
-	enum ring_buffer_flags rb_flags;
-	int i;
 	int ret = -ENOMEM;
 
 
@@ -5758,69 +5802,21 @@ __init static int tracer_alloc_buffers(void)
 	else
 		ring_buf_size = 1;
 
-	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
-
 	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 	cpumask_copy(tracing_cpumask, cpu_all_mask);
 
 	raw_spin_lock_init(&global_trace.start_lock);
 
 	/* TODO: make the number of buffers hot pluggable with CPUS */
-	global_trace.trace_buffer.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
-	if (!global_trace.trace_buffer.buffer) {
+	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
 		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
 		WARN_ON(1);
 		goto out_free_cpumask;
 	}
 
-	global_trace.trace_buffer.data = alloc_percpu(struct trace_array_cpu);
-
-	if (!global_trace.trace_buffer.data) {
-		printk(KERN_ERR "tracer: failed to allocate percpu memory!\n");
-		WARN_ON(1);
-		goto out_free_cpumask;
-	}
-
-	for_each_tracing_cpu(i) {
-		memset(per_cpu_ptr(global_trace.trace_buffer.data, i), 0,
-		       sizeof(struct trace_array_cpu));
-		per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.cpu = i;
-		per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.tr = &global_trace;
-	}
-
 	if (global_trace.buffer_disabled)
 		tracing_off();
 
-#ifdef CONFIG_TRACER_MAX_TRACE
-	global_trace.max_buffer.data = alloc_percpu(struct trace_array_cpu);
-	if (!global_trace.max_buffer.data) {
-		printk(KERN_ERR "tracer: failed to allocate percpu memory!\n");
-		WARN_ON(1);
-		goto out_free_cpumask;
-	}
-	global_trace.max_buffer.buffer = ring_buffer_alloc(1, rb_flags);
-	if (!global_trace.max_buffer.buffer) {
-		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
-		WARN_ON(1);
-		ring_buffer_free(global_trace.trace_buffer.buffer);
-		goto out_free_cpumask;
-	}
-
-	for_each_tracing_cpu(i) {
-		memset(per_cpu_ptr(global_trace.max_buffer.data, i), 0,
-		       sizeof(struct trace_array_cpu));
-		per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.cpu = i;
-		per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.tr = &global_trace;
-	}
-#endif
-
-	/* Allocate the first page for all buffers */
-	set_buffer_entries(&global_trace.trace_buffer,
-			   ring_buffer_size(global_trace.trace_buffer.buffer, 0));
-#ifdef CONFIG_TRACER_MAX_TRACE
-	set_buffer_entries(&global_trace.max_buffer, 1);
-#endif
-
 	trace_init_cmdlines();
 
 	register_tracer(&nop_trace);
-- 
cgit v1.2.3


From ce9bae55972b228cf7bac34350c4d2caf8ea0d0b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 21:23:55 -0500
Subject: tracing: Add snapshot feature to instances

Add the "snapshot" file to the the multi-buffer instances.

  cd /sys/kernel/debug/tracing/instances
  mkdir foo
  ls foo
buffer_size_kb  buffer_total_size_kb  events  free_buffer  set_event
snapshot  trace  trace_clock  trace_marker  trace_options  trace_pipe
tracing_on
  cat foo/snapshot
 # tracer: nop
 #
 #
 # * Snapshot is freed *
 #
 # Snapshot commands:
 # echo 0 > snapshot : Clears and frees snapshot buffer
 # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.
 #                      Takes a snapshot of the main buffer.
 # echo 2 > snapshot : Clears snapshot buffer (but does not allocate)
 #                      (Doesn't have to be '2' works with any number that
 #                       is not a '0' or '1')

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 57895d476509..17671bc9a4b1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4302,9 +4302,9 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		local_irq_disable();
 		/* Now, we're going to swap */
 		if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
-			update_max_tr(&global_trace, current, smp_processor_id());
+			update_max_tr(tr, current, smp_processor_id());
 		else
-			update_max_tr_single(&global_trace, current, iter->cpu_file);
+			update_max_tr_single(tr, current, iter->cpu_file);
 		local_irq_enable();
 		break;
 	default:
@@ -5533,6 +5533,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
 
 	trace_create_file("tracing_on", 0644, d_tracer,
 			    tr, &rb_simple_fops);
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+	trace_create_file("snapshot", 0644, d_tracer,
+			  (void *)&tr->trace_cpu, &snapshot_fops);
+#endif
 }
 
 static __init int tracer_init_debugfs(void)
@@ -5574,11 +5579,6 @@ static __init int tracer_init_debugfs(void)
 			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
 #endif
 
-#ifdef CONFIG_TRACER_SNAPSHOT
-	trace_create_file("snapshot", 0644, d_tracer,
-			  (void *)&global_trace.trace_cpu, &snapshot_fops);
-#endif
-
 	create_trace_instances(d_tracer);
 
 	create_trace_options_dir(&global_trace);
-- 
cgit v1.2.3


From 121aaee7b0a82605d33af200c7e9ebab6fd6e444 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 21:52:25 -0500
Subject: tracing: Add per_cpu directory into tracing instances

Add the per_cpu directory to the created tracing instances:

  cd /sys/kernel/debug/tracing/instances
  mkdir foo
  ls foo/per_cpu/cpu0
buffer_size_kb	snapshot_raw  trace	  trace_pipe_raw
snapshot	stats	      trace_pipe

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 17671bc9a4b1..c547ebbe36ff 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5506,6 +5506,7 @@ static __init void create_trace_instances(struct dentry *d_tracer)
 static void
 init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
 {
+	int cpu;
 
 	trace_create_file("trace_options", 0644, d_tracer,
 			  tr, &tracing_iter_fops);
@@ -5538,12 +5539,15 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("snapshot", 0644, d_tracer,
 			  (void *)&tr->trace_cpu, &snapshot_fops);
 #endif
+
+	for_each_tracing_cpu(cpu)
+		tracing_init_debugfs_percpu(tr, cpu);
+
 }
 
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	int cpu;
 
 	trace_access_lock_init();
 
@@ -5583,9 +5587,6 @@ static __init int tracer_init_debugfs(void)
 
 	create_trace_options_dir(&global_trace);
 
-	for_each_tracing_cpu(cpu)
-		tracing_init_debugfs_percpu(&global_trace, cpu);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From a695cb5816228f86576f5f5c6809fdf8ed382ece Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Wed, 6 Mar 2013 15:27:24 -0500
Subject: tracing: Prevent deleting instances when they are being read

Add a ref count to the trace_array structure and prevent removal
of instances that have open descriptors.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 23 +++++++++++++++++++++++
 kernel/trace/trace.h |  1 +
 2 files changed, 24 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c547ebbe36ff..3a89496dc99b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2613,6 +2613,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 		tracing_iter_reset(iter, cpu);
 	}
 
+	tr->ref++;
+
 	mutex_unlock(&trace_types_lock);
 
 	return iter;
@@ -2649,6 +2651,10 @@ static int tracing_release(struct inode *inode, struct file *file)
 	tr = iter->tr;
 
 	mutex_lock(&trace_types_lock);
+
+	WARN_ON(!tr->ref);
+	tr->ref--;
+
 	for_each_tracing_cpu(cpu) {
 		if (iter->buffer_iter[cpu])
 			ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -4460,6 +4466,10 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 	if (!info)
 		return -ENOMEM;
 
+	mutex_lock(&trace_types_lock);
+
+	tr->ref++;
+
 	info->iter.tr		= tr;
 	info->iter.cpu_file	= tc->cpu;
 	info->iter.trace	= tr->current_trace;
@@ -4470,6 +4480,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = info;
 
+	mutex_unlock(&trace_types_lock);
+
 	return nonseekable_open(inode, filp);
 }
 
@@ -4568,10 +4580,17 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 	struct ftrace_buffer_info *info = file->private_data;
 	struct trace_iterator *iter = &info->iter;
 
+	mutex_lock(&trace_types_lock);
+
+	WARN_ON(!iter->tr->ref);
+	iter->tr->ref--;
+
 	if (info->spare)
 		ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
 	kfree(info);
 
+	mutex_unlock(&trace_types_lock);
+
 	return 0;
 }
 
@@ -5411,6 +5430,10 @@ static int instance_delete(const char *name)
 	if (!found)
 		goto out_unlock;
 
+	ret = -EBUSY;
+	if (tr->ref)
+		goto out_unlock;
+
 	list_del(&tr->list);
 
 	event_trace_del_tracer(tr);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1a456c291a07..f4931821a966 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -219,6 +219,7 @@ struct trace_array {
 	struct list_head	systems;
 	struct list_head	events;
 	struct task_struct	*waiter;
+	int			ref;
 };
 
 enum {
-- 
cgit v1.2.3


From ad909e21bbe69f1d39055d346540abd827190eca Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Wed, 6 Mar 2013 21:45:37 -0500
Subject: tracing: Add internal tracing_snapshot() functions

The new snapshot feature is quite handy. It's a way for the user
to take advantage of the spare buffer that, until then, only
the latency tracers used to "snapshot" the buffer when it hit
a max latency. Now users can trigger a "snapshot" manually when
some condition is hit in a program. But a snapshot currently can
not be triggered by a condition inside the kernel.

With the addition of tracing_snapshot() and tracing_snapshot_alloc(),
snapshots can now be taking when a condition is hit, and the
developer wants to snapshot the case without stopping the trace.

Note, any snapshot will overwrite the old one, so take care
in how this is done.

These new functions are to be used like tracing_on(), tracing_off()
and trace_printk() are. That is, they should never be called
in the mainline Linux kernel. They are solely for the purpose
of debugging.

The tracing_snapshot() will not allocate a buffer, but it is
safe to be called from any context (except NMIs). But if a
snapshot buffer isn't allocated when it is called, it will write
to the live buffer, complaining about the lack of a snapshot
buffer, and then stop tracing (giving you the "permanent snapshot").

tracing_snapshot_alloc() will allocate the snapshot buffer if
it was not already allocated and then take the snapshot. This routine
*may sleep*, and must be called from context that can sleep.
The allocation is done with GFP_KERNEL and not atomic.

If you need a snapshot in an atomic context, say in early boot,
then it is best to call the tracing_snapshot_alloc() before then,
where it will allocate the buffer, and then you can use the
tracing_snapshot() anywhere you want and still get snapshots.

Cc: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3a89496dc99b..307524d784ec 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -339,6 +339,90 @@ void tracing_on(void)
 }
 EXPORT_SYMBOL_GPL(tracing_on);
 
+#ifdef CONFIG_TRACER_SNAPSHOT
+/**
+ * trace_snapshot - take a snapshot of the current buffer.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ *
+ * Note, make sure to allocate the snapshot with either
+ * a tracing_snapshot_alloc(), or by doing it manually
+ * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ *
+ * If the snapshot buffer is not allocated, it will stop tracing.
+ * Basically making a permanent snapshot.
+ */
+void tracing_snapshot(void)
+{
+	struct trace_array *tr = &global_trace;
+	struct tracer *tracer = tr->current_trace;
+	unsigned long flags;
+
+	if (!tr->allocated_snapshot) {
+		trace_printk("*** SNAPSHOT NOT ALLOCATED ***\n");
+		trace_printk("*** stopping trace here!   ***\n");
+		tracing_off();
+		return;
+	}
+
+	/* Note, snapshot can not be used when the tracer uses it */
+	if (tracer->use_max_tr) {
+		trace_printk("*** LATENCY TRACER ACTIVE ***\n");
+		trace_printk("*** Can not use snapshot (sorry) ***\n");
+		return;
+	}
+
+	local_irq_save(flags);
+	update_max_tr(tr, current, smp_processor_id());
+	local_irq_restore(flags);
+}
+
+static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
+					struct trace_buffer *size_buf, int cpu_id);
+
+/**
+ * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ *
+ * This is similar to trace_snapshot(), but it will allocate the
+ * snapshot buffer if it isn't already allocated. Use this only
+ * where it is safe to sleep, as the allocation may sleep.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ */
+void tracing_snapshot_alloc(void)
+{
+	struct trace_array *tr = &global_trace;
+	int ret;
+
+	if (!tr->allocated_snapshot) {
+
+		/* allocate spare buffer */
+		ret = resize_buffer_duplicate_size(&tr->max_buffer,
+				   &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
+		if (WARN_ON(ret < 0))
+			return;
+
+		tr->allocated_snapshot = true;
+	}
+
+	tracing_snapshot();
+}
+#else
+void tracing_snapshot(void)
+{
+	WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
+}
+void tracing_snapshot_alloc(void)
+{
+	/* Give warning */
+	tracing_snapshot();
+}
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
 /**
  * tracing_off - turn off tracing buffers
  *
-- 
cgit v1.2.3


From f5eb5588262cab7232ed1d77cf612b327db50767 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Thu, 7 Mar 2013 09:27:42 -0500
Subject: ring-buffer: Do not use schedule_work_on() for current CPU

The ring buffer updates when done while the ring buffer is active,
needs to be completed on the CPU that is used for the ring buffer
per_cpu buffer. To accomplish this, schedule_work_on() is used to
schedule work on the given CPU.

Now there's no reason to use schedule_work_on() if the process
doing the update happens to be on the CPU that it is processing.
It has already filled the requirement. Instead, just do the work
and continue.

This is needed for tracing_snapshot_alloc() where it may be called
really early in boot, where the work queues have not been set up yet.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 65fe2a4f9824..d1c85c5f5f51 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1679,11 +1679,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
 			if (!cpu_buffer->nr_pages_to_update)
 				continue;
 
-			if (cpu_online(cpu))
+			/* The update must run on the CPU that is being updated. */
+			preempt_disable();
+			if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+				rb_update_pages(cpu_buffer);
+				cpu_buffer->nr_pages_to_update = 0;
+			} else {
+				/*
+				 * Can not disable preemption for schedule_work_on()
+				 * on PREEMPT_RT.
+				 */
+				preempt_enable();
 				schedule_work_on(cpu,
 						&cpu_buffer->update_pages_work);
-			else
-				rb_update_pages(cpu_buffer);
+				preempt_disable();
+			}
+			preempt_enable();
 		}
 
 		/* wait for all the updates to complete */
@@ -1721,12 +1732,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
 
 		get_online_cpus();
 
-		if (cpu_online(cpu_id)) {
+		preempt_disable();
+		/* The update must run on the CPU that is being updated. */
+		if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
+			rb_update_pages(cpu_buffer);
+		else {
+			/*
+			 * Can not disable preemption for schedule_work_on()
+			 * on PREEMPT_RT.
+			 */
+			preempt_enable();
 			schedule_work_on(cpu_id,
 					 &cpu_buffer->update_pages_work);
 			wait_for_completion(&cpu_buffer->update_done);
-		} else
-			rb_update_pages(cpu_buffer);
+			preempt_disable();
+		}
+		preempt_enable();
 
 		cpu_buffer->nr_pages_to_update = 0;
 		put_online_cpus();
-- 
cgit v1.2.3


From f4e781c0a89d5810729772290441ac7d61f321ec Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Thu, 7 Mar 2013 11:10:56 -0500
Subject: tracing: Move the tracing selftest code into its own function

Move the tracing startup selftest code into its own function and
when not enabled, always have that function succeed.

This makes the register_tracer() function much more readable.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 124 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 69 insertions(+), 55 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 307524d784ec..57b4220d96a9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -818,6 +818,72 @@ static void default_wait_pipe(struct trace_iterator *iter)
 	ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
 }
 
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int run_tracer_selftest(struct tracer *type)
+{
+	struct trace_array *tr = &global_trace;
+	struct tracer *saved_tracer = tr->current_trace;
+	int ret;
+
+	if (!type->selftest || tracing_selftest_disabled)
+		return 0;
+
+	/*
+	 * Run a selftest on this tracer.
+	 * Here we reset the trace buffer, and set the current
+	 * tracer to be this tracer. The tracer can then run some
+	 * internal tracing to verify that everything is in order.
+	 * If we fail, we do not register this tracer.
+	 */
+	tracing_reset_online_cpus(&tr->trace_buffer);
+
+	tr->current_trace = type;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (type->use_max_tr) {
+		/* If we expanded the buffers, make sure the max is expanded too */
+		if (ring_buffer_expanded)
+			ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
+					   RING_BUFFER_ALL_CPUS);
+		tr->allocated_snapshot = true;
+	}
+#endif
+
+	/* the test is responsible for initializing and enabling */
+	pr_info("Testing tracer %s: ", type->name);
+	ret = type->selftest(type, tr);
+	/* the test is responsible for resetting too */
+	tr->current_trace = saved_tracer;
+	if (ret) {
+		printk(KERN_CONT "FAILED!\n");
+		/* Add the warning after printing 'FAILED' */
+		WARN_ON(1);
+		return -1;
+	}
+	/* Only reset on passing, to avoid touching corrupted buffers */
+	tracing_reset_online_cpus(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (type->use_max_tr) {
+		tr->allocated_snapshot = false;
+
+		/* Shrink the max buffer again */
+		if (ring_buffer_expanded)
+			ring_buffer_resize(tr->max_buffer.buffer, 1,
+					   RING_BUFFER_ALL_CPUS);
+	}
+#endif
+
+	printk(KERN_CONT "PASSED\n");
+	return 0;
+}
+#else
+static inline int run_tracer_selftest(struct tracer *type)
+{
+	return 0;
+}
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
 /**
  * register_tracer - register a tracer with the ftrace system.
  * @type - the plugin for the tracer
@@ -863,61 +929,9 @@ int register_tracer(struct tracer *type)
 	if (!type->wait_pipe)
 		type->wait_pipe = default_wait_pipe;
 
-
-#ifdef CONFIG_FTRACE_STARTUP_TEST
-	if (type->selftest && !tracing_selftest_disabled) {
-		struct trace_array *tr = &global_trace;
-		struct tracer *saved_tracer = tr->current_trace;
-
-		/*
-		 * Run a selftest on this tracer.
-		 * Here we reset the trace buffer, and set the current
-		 * tracer to be this tracer. The tracer can then run some
-		 * internal tracing to verify that everything is in order.
-		 * If we fail, we do not register this tracer.
-		 */
-		tracing_reset_online_cpus(&tr->trace_buffer);
-
-		tr->current_trace = type;
-
-#ifdef CONFIG_TRACER_MAX_TRACE
-		if (type->use_max_tr) {
-			/* If we expanded the buffers, make sure the max is expanded too */
-			if (ring_buffer_expanded)
-				ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
-						   RING_BUFFER_ALL_CPUS);
-			tr->allocated_snapshot = true;
-		}
-#endif
-
-		/* the test is responsible for initializing and enabling */
-		pr_info("Testing tracer %s: ", type->name);
-		ret = type->selftest(type, tr);
-		/* the test is responsible for resetting too */
-		tr->current_trace = saved_tracer;
-		if (ret) {
-			printk(KERN_CONT "FAILED!\n");
-			/* Add the warning after printing 'FAILED' */
-			WARN_ON(1);
-			goto out;
-		}
-		/* Only reset on passing, to avoid touching corrupted buffers */
-		tracing_reset_online_cpus(&tr->trace_buffer);
-
-#ifdef CONFIG_TRACER_MAX_TRACE
-		if (type->use_max_tr) {
-			tr->allocated_snapshot = false;
-
-			/* Shrink the max buffer again */
-			if (ring_buffer_expanded)
-				ring_buffer_resize(tr->max_buffer.buffer, 1,
-						   RING_BUFFER_ALL_CPUS);
-		}
-#endif
-
-		printk(KERN_CONT "PASSED\n");
-	}
-#endif
+	ret = run_tracer_selftest(type);
+	if (ret < 0)
+		goto out;
 
 	type->next = trace_types;
 	trace_types = type;
-- 
cgit v1.2.3


From 55034cd6e648155393b0d665eef76b38d49ad6bf Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Thu, 7 Mar 2013 22:48:09 -0500
Subject: tracing: Add alloc_snapshot kernel command line parameter

If debugging the kernel, and the developer wants to use
tracing_snapshot() in places where tracing_snapshot_alloc() may
be difficult (or more likely, the developer is lazy and doesn't
want to bother with tracing_snapshot_alloc() at all), then adding

  alloc_snapshot

to the kernel command line parameter will tell ftrace to allocate
the snapshot buffer (if configured) when it allocates the main
tracing buffer.

I also noticed that ring_buffer_expanded and tracing_selftest_disabled
had inconsistent use of boolean "true" and "false" with "0" and "1".
I cleaned that up too.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 81 +++++++++++++++++++++++++++------------------
 kernel/trace/trace.h        |  2 +-
 kernel/trace/trace_events.c |  4 +--
 3 files changed, 51 insertions(+), 36 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 57b4220d96a9..4021a5e66412 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -47,7 +47,7 @@
  * On boot up, the ring buffer is set to the minimum size, so that
  * we do not waste memory on systems that are not using tracing.
  */
-int ring_buffer_expanded;
+bool ring_buffer_expanded;
 
 /*
  * We need to change this state when a selftest is running.
@@ -121,12 +121,14 @@ static int tracing_set_tracer(const char *buf);
 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
 static char *default_bootup_tracer;
 
+static bool allocate_snapshot;
+
 static int __init set_cmdline_ftrace(char *str)
 {
 	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
 	/* We are using ftrace early, expand it */
-	ring_buffer_expanded = 1;
+	ring_buffer_expanded = true;
 	return 1;
 }
 __setup("ftrace=", set_cmdline_ftrace);
@@ -147,6 +149,15 @@ static int __init set_ftrace_dump_on_oops(char *str)
 }
 __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
 
+static int __init alloc_snapshot(char *str)
+{
+	allocate_snapshot = true;
+	/* We also need the main ring buffer expanded */
+	ring_buffer_expanded = true;
+	return 1;
+}
+__setup("alloc_snapshot", alloc_snapshot);
+
 
 static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
 static char *trace_boot_options __initdata;
@@ -951,7 +962,7 @@ int register_tracer(struct tracer *type)
 	tracing_set_tracer(type->name);
 	default_bootup_tracer = NULL;
 	/* disable other selftests, since this will break it. */
-	tracing_selftest_disabled = 1;
+	tracing_selftest_disabled = true;
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
 	       type->name);
@@ -3318,7 +3329,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 	 * we use the size that was given, and we can forget about
 	 * expanding it later.
 	 */
-	ring_buffer_expanded = 1;
+	ring_buffer_expanded = true;
 
 	/* May be called before buffers are initialized */
 	if (!tr->trace_buffer.buffer)
@@ -5396,53 +5407,57 @@ static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
 	}
 }
 
-static int allocate_trace_buffers(struct trace_array *tr, int size)
+static int
+allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
 {
 	enum ring_buffer_flags rb_flags;
 
 	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
 
-	tr->trace_buffer.buffer = ring_buffer_alloc(size, rb_flags);
-	if (!tr->trace_buffer.buffer)
-		goto out_free;
+	buf->buffer = ring_buffer_alloc(size, rb_flags);
+	if (!buf->buffer)
+		return -ENOMEM;
 
-	tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu);
-	if (!tr->trace_buffer.data)
-		goto out_free;
+	buf->data = alloc_percpu(struct trace_array_cpu);
+	if (!buf->data) {
+		ring_buffer_free(buf->buffer);
+		return -ENOMEM;
+	}
 
-	init_trace_buffers(tr, &tr->trace_buffer);
+	init_trace_buffers(tr, buf);
 
 	/* Allocate the first page for all buffers */
 	set_buffer_entries(&tr->trace_buffer,
 			   ring_buffer_size(tr->trace_buffer.buffer, 0));
 
-#ifdef CONFIG_TRACER_MAX_TRACE
-
-	tr->max_buffer.buffer = ring_buffer_alloc(1, rb_flags);
-	if (!tr->max_buffer.buffer)
-		goto out_free;
-
-	tr->max_buffer.data = alloc_percpu(struct trace_array_cpu);
-	if (!tr->max_buffer.data)
-		goto out_free;
+	return 0;
+}
 
-	init_trace_buffers(tr, &tr->max_buffer);
+static int allocate_trace_buffers(struct trace_array *tr, int size)
+{
+	int ret;
 
-	set_buffer_entries(&tr->max_buffer, 1);
-#endif
-	return 0;
+	ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
+	if (ret)
+		return ret;
 
- out_free:
-	if (tr->trace_buffer.buffer)
+#ifdef CONFIG_TRACER_MAX_TRACE
+	ret = allocate_trace_buffer(tr, &tr->max_buffer,
+				    allocate_snapshot ? size : 1);
+	if (WARN_ON(ret)) {
 		ring_buffer_free(tr->trace_buffer.buffer);
-	free_percpu(tr->trace_buffer.data);
+		free_percpu(tr->trace_buffer.data);
+		return -ENOMEM;
+	}
+	tr->allocated_snapshot = allocate_snapshot;
 
-#ifdef CONFIG_TRACER_MAX_TRACE
-	if (tr->max_buffer.buffer)
-		ring_buffer_free(tr->max_buffer.buffer);
-	free_percpu(tr->max_buffer.data);
+	/*
+	 * Only the top level trace array gets its snapshot allocated
+	 * from the kernel command line.
+	 */
+	allocate_snapshot = false;
 #endif
-	return -ENOMEM;
+	return 0;
 }
 
 static int new_instance_create(const char *name)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f4931821a966..26bc71834041 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -660,7 +660,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
 #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
 extern int DYN_FTRACE_TEST_NAME2(void);
 
-extern int ring_buffer_expanded;
+extern bool ring_buffer_expanded;
 extern bool tracing_selftest_disabled;
 DECLARE_PER_CPU(int, ftrace_cpu_disabled);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a376ab5eec5c..38b54c5edeb9 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1844,8 +1844,8 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
 static __init int setup_trace_event(char *str)
 {
 	strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
-	ring_buffer_expanded = 1;
-	tracing_selftest_disabled = 1;
+	ring_buffer_expanded = true;
+	tracing_selftest_disabled = true;
 
 	return 1;
 }
-- 
cgit v1.2.3


From 153e8ed913b022d2003866a848af9fadc041403f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 8 Mar 2013 10:40:07 -0500
Subject: tracing: Fix the branch tracer that broke with buffer change

The changce to add the trace_buffer struct to have the trace array
have both the main buffer and max buffer broke the branch tracer
because the change did not update that code. As the branch tracer
adds a significant amount of overhead, and must be selected via
a selection (not a allyesconfig) it was missed in testing.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_branch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6dadbefbb1d6..d594da0dc03c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -52,12 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
-	data = per_cpu_ptr(tr->data, cpu);
+	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
 	if (atomic_inc_return(&data->disabled) != 1)
 		goto out;
 
 	pc = preempt_count();
-	buffer = tr->buffer;
+	buffer = tr->trace_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
 					  sizeof(*entry), flags, pc);
 	if (!event)
-- 
cgit v1.2.3


From 09ae72348eccb60e304cf8ce94653f4a78fcd407 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 8 Mar 2013 21:02:34 -0500
Subject: tracing: Add trace_puts() for even faster trace_printk() tracing

The trace_printk() is extremely fast and is very handy as it can be
used in any context (including NMIs!). But it still requires scanning
the fmt string for parsing the args. Even the trace_bprintk() requires
a scan to know what args will be saved, although it doesn't copy the
format string itself.

Several times trace_printk() has no args, and wastes cpu cycles scanning
the fmt string.

Adding trace_puts() allows the developer to use an even faster
tracing method that only saves the pointer to the string in the
ring buffer without doing any format parsing at all. This will
help remove even more of the "Heisenbug" effect, when debugging.

Also fixed up the F_printk()s for the ftrace internal bprint and print events.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c         | 76 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h         |  2 ++
 kernel/trace/trace_entries.h | 23 +++++++++++---
 kernel/trace/trace_output.c  | 75 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h  |  2 ++
 5 files changed, 174 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4021a5e66412..5043a0c4dde0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -350,6 +350,77 @@ void tracing_on(void)
 }
 EXPORT_SYMBOL_GPL(tracing_on);
 
+/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip:	   The address of the caller
+ * @str:   The constant string to write
+ * @size:  The size of the string.
+ */
+int __trace_puts(unsigned long ip, const char *str, int size)
+{
+	struct ring_buffer_event *event;
+	struct ring_buffer *buffer;
+	struct print_entry *entry;
+	unsigned long irq_flags;
+	int alloc;
+
+	alloc = sizeof(*entry) + size + 2; /* possible \n added */
+
+	local_save_flags(irq_flags);
+	buffer = global_trace.trace_buffer.buffer;
+	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 
+					  irq_flags, preempt_count());
+	if (!event)
+		return 0;
+
+	entry = ring_buffer_event_data(event);
+	entry->ip = ip;
+
+	memcpy(&entry->buf, str, size);
+
+	/* Add a newline if necessary */
+	if (entry->buf[size - 1] != '\n') {
+		entry->buf[size] = '\n';
+		entry->buf[size + 1] = '\0';
+	} else
+		entry->buf[size] = '\0';
+
+	__buffer_unlock_commit(buffer, event);
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(__trace_puts);
+
+/**
+ * __trace_bputs - write the pointer to a constant string into trace buffer
+ * @ip:	   The address of the caller
+ * @str:   The constant string to write to the buffer to
+ */
+int __trace_bputs(unsigned long ip, const char *str)
+{
+	struct ring_buffer_event *event;
+	struct ring_buffer *buffer;
+	struct bputs_entry *entry;
+	unsigned long irq_flags;
+	int size = sizeof(struct bputs_entry);
+
+	local_save_flags(irq_flags);
+	buffer = global_trace.trace_buffer.buffer;
+	event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
+					  irq_flags, preempt_count());
+	if (!event)
+		return 0;
+
+	entry = ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->str			= str;
+
+	__buffer_unlock_commit(buffer, event);
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(__trace_bputs);
+
 #ifdef CONFIG_TRACER_SNAPSHOT
 /**
  * trace_snapshot - take a snapshot of the current buffer.
@@ -2475,6 +2546,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
 			return ret;
 	}
 
+	if (iter->ent->type == TRACE_BPUTS &&
+			trace_flags & TRACE_ITER_PRINTK &&
+			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+		return trace_print_bputs_msg_only(iter);
+
 	if (iter->ent->type == TRACE_BPRINT &&
 			trace_flags & TRACE_ITER_PRINTK &&
 			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 26bc71834041..d5764a8532e2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,6 +34,7 @@ enum trace_type {
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
 	TRACE_BLK,
+	TRACE_BPUTS,
 
 	__TRACE_LAST_TYPE,
 };
@@ -277,6 +278,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
+		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca2..e2d027ac66a2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,
 		__dynamic_array(	u32,	buf	)
 	),
 
-	F_printk("%08lx fmt:%p",
-		 __entry->ip, __entry->fmt),
+	F_printk("%pf: %s",
+		 (void *)__entry->ip, __entry->fmt),
 
 	FILTER_OTHER
 );
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,
 		__dynamic_array(	char,	buf	)
 	),
 
-	F_printk("%08lx %s",
-		 __entry->ip, __entry->buf),
+	F_printk("%pf: %s",
+		 (void *)__entry->ip, __entry->buf),
+
+	FILTER_OTHER
+);
+
+FTRACE_ENTRY(bputs, bputs_entry,
+
+	TRACE_BPUTS,
+
+	F_STRUCT(
+		__field(	unsigned long,	ip	)
+		__field(	const char *,	str	)
+	),
+
+	F_printk("%pf: %s",
+		 (void *)__entry->ip, __entry->str),
 
 	FILTER_OTHER
 );
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 2edc7220d017..19f48e7edc39 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
 	return ret;
 }
 
+enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct bputs_entry *field;
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_puts(s, field->str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1244,6 +1260,64 @@ static struct trace_event trace_user_stack_event = {
 	.funcs		= &trace_user_stack_funcs,
 };
 
+/* TRACE_BPUTS */
+static enum print_line_t
+trace_bputs_print(struct trace_iterator *iter, int flags,
+		   struct trace_event *event)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct bputs_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (!trace_seq_puts(s, ": "))
+		goto partial;
+
+	if (!trace_seq_puts(s, field->str))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+
+static enum print_line_t
+trace_bputs_raw(struct trace_iterator *iter, int flags,
+		struct trace_event *event)
+{
+	struct bputs_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+
+	if (!trace_seq_printf(s, ": %lx : ", field->ip))
+		goto partial;
+
+	if (!trace_seq_puts(s, field->str))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event_functions trace_bputs_funcs = {
+	.trace		= trace_bputs_print,
+	.raw		= trace_bputs_raw,
+};
+
+static struct trace_event trace_bputs_event = {
+	.type		= TRACE_BPUTS,
+	.funcs		= &trace_bputs_funcs,
+};
+
 /* TRACE_BPRINT */
 static enum print_line_t
 trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1356,6 +1430,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_wake_event,
 	&trace_stack_event,
 	&trace_user_stack_event,
+	&trace_bputs_event,
 	&trace_bprint_event,
 	&trace_print_event,
 	NULL
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index c038eba0492b..af77870de278 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -4,6 +4,8 @@
 #include <linux/trace_seq.h>
 #include "trace.h"
 
+extern enum print_line_t
+trace_print_bputs_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
 trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
-- 
cgit v1.2.3


From ca268da6e415448a43138e1abc5d5f057af319d7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Sat, 9 Mar 2013 00:40:58 -0500
Subject: tracing: Add internal ftrace trace_puts() for ftrace to use

There's a few places that ftrace uses trace_printk() for internal
use, but this requires context (normal, softirq, irq, NMI) buffers
to keep things lockless. But the trace_puts() does not, as it can
write the string directly into the ring buffer. Make a internal helper
for trace_puts() and have the internal functions use that.

This way the extra context buffers are not used.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c |  8 ++++----
 kernel/trace/trace.h | 11 +++++++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5043a0c4dde0..d372c6504c99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -443,16 +443,16 @@ void tracing_snapshot(void)
 	unsigned long flags;
 
 	if (!tr->allocated_snapshot) {
-		trace_printk("*** SNAPSHOT NOT ALLOCATED ***\n");
-		trace_printk("*** stopping trace here!   ***\n");
+		internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
+		internal_trace_puts("*** stopping trace here!   ***\n");
 		tracing_off();
 		return;
 	}
 
 	/* Note, snapshot can not be used when the tracer uses it */
 	if (tracer->use_max_tr) {
-		trace_printk("*** LATENCY TRACER ACTIVE ***\n");
-		trace_printk("*** Can not use snapshot (sorry) ***\n");
+		internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
+		internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
 		return;
 	}
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d5764a8532e2..0e430b401ab6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1040,6 +1040,17 @@ void trace_printk_start_comm(void);
 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 
+/*
+ * Normal trace_printk() and friends allocates special buffers
+ * to do the manipulation, as well as saves the print formats
+ * into sections to display. But the trace infrastructure wants
+ * to use these without the added overhead at the price of being
+ * a bit slower (used mainly for warnings, where we don't care
+ * about performance). The internal_trace_puts() is for such
+ * a purpose.
+ */
+#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
+
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\
 	extern struct ftrace_event_call					\
-- 
cgit v1.2.3


From 1b22e382ab40b0e3ee5abb3e310dffb16fee22aa Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Sat, 9 Mar 2013 00:56:08 -0500
Subject: tracing: Let tracing_snapshot() be used by modules but not NMI

Add EXPORT_SYMBOL_GPL() to let the tracing_snapshot() functions be
called from modules.

Also add a test to see if the snapshot was called from NMI context
and just warn in the tracing buffer if so, and return.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d372c6504c99..5c53e4092269 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -442,6 +442,12 @@ void tracing_snapshot(void)
 	struct tracer *tracer = tr->current_trace;
 	unsigned long flags;
 
+	if (in_nmi()) {
+		internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+		internal_trace_puts("*** snapshot is being ignored        ***\n");
+		return;
+	}
+
 	if (!tr->allocated_snapshot) {
 		internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
 		internal_trace_puts("*** stopping trace here!   ***\n");
@@ -460,6 +466,7 @@ void tracing_snapshot(void)
 	update_max_tr(tr, current, smp_processor_id());
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(tracing_snapshot);
 
 static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
 					struct trace_buffer *size_buf, int cpu_id);
@@ -493,16 +500,19 @@ void tracing_snapshot_alloc(void)
 
 	tracing_snapshot();
 }
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
 #else
 void tracing_snapshot(void)
 {
 	WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
 }
+EXPORT_SYMBOL_GPL(tracing_snapshot);
 void tracing_snapshot_alloc(void)
 {
 	/* Give warning */
 	tracing_snapshot();
 }
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
 /**
-- 
cgit v1.2.3


From 1c31714328be90764e46716f31fb0bd6da44c305 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Sat, 9 Mar 2013 08:36:53 -0500
Subject: tracing: Consolidate updating of count for traceon/off

Remove some duplicate code and replace it with a helper function.
This makes the code a it cleaner.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index e467c0c7bdd5..38cfb290ecd9 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -214,38 +214,37 @@ static struct tracer function_trace __read_mostly =
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-static void
-ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+static int update_count(void **data)
 {
-	long *count = (long *)data;
-
-	if (tracing_is_on())
-		return;
+	unsigned long *count = (long *)data;
 
 	if (!*count)
-		return;
+		return 0;
 
 	if (*count != -1)
 		(*count)--;
 
-	tracing_on();
+	return 1;
 }
 
 static void
-ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
 {
-	long *count = (long *)data;
-
-	if (!tracing_is_on())
+	if (tracing_is_on())
 		return;
 
-	if (!*count)
-		return;
+	if (update_count(data))
+		tracing_on();
+}
 
-	if (*count != -1)
-		(*count)--;
+static void
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	if (!tracing_is_on())
+		return;
 
-	tracing_off();
+	if (update_count(data))
+		tracing_off();
 }
 
 static int
-- 
cgit v1.2.3


From 8b8fa62c60e03a53c46324075a8dc25821741daa Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 12 Mar 2013 09:25:00 -0400
Subject: tracing: Consolidate ftrace_trace_onoff_unreg() into callback

The only thing ftrace_trace_onoff_unreg() does is to do a strcmp()
against the cmd parameter to determine what op to unregister. But
this compare is also done after the location that this function is
called (and returns). By moving the check for '!' to unregister after
the strcmp(), the callback function itself can just do the unregister
and we can get rid of the helper function.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions.c | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 38cfb290ecd9..a88a3e0b0cc2 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -282,22 +282,6 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 	return 0;
 }
 
-static int
-ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
-{
-	struct ftrace_probe_ops *ops;
-
-	/* we register both traceon and traceoff to this callback */
-	if (strcmp(cmd, "traceon") == 0)
-		ops = &traceon_probe_ops;
-	else
-		ops = &traceoff_probe_ops;
-
-	unregister_ftrace_function_probe_func(glob, ops);
-
-	return 0;
-}
-
 static int
 ftrace_trace_onoff_callback(struct ftrace_hash *hash,
 			    char *glob, char *cmd, char *param, int enable)
@@ -311,15 +295,17 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
 	if (!enable)
 		return -EINVAL;
 
-	if (glob[0] == '!')
-		return ftrace_trace_onoff_unreg(glob+1, cmd, param);
-
 	/* we register both traceon and traceoff to this callback */
 	if (strcmp(cmd, "traceon") == 0)
 		ops = &traceon_probe_ops;
 	else
 		ops = &traceoff_probe_ops;
 
+	if (glob[0] == '!') {
+		unregister_ftrace_function_probe_func(glob+1, ops);
+		return 0;
+	}
+
 	if (!param)
 		goto out_reg;
 
-- 
cgit v1.2.3


From 8380d24860e9d1659ab22896b86d7fe591c424fa Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Sat, 9 Mar 2013 08:56:43 -0500
Subject: ftrace: Separate unlimited probes from count limited probes

The function tracing probes that trigger traceon or traceoff can be
set to unlimited, or given a count of # of times to execute.

By separating these two types of probes, we can then use the dynamic
ftrace function filtering directly, and remove the brute force
"check if this function called is my probe" routines in ftrace.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions.c | 38 +++++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index a88a3e0b0cc2..043b2425ae73 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -228,7 +228,7 @@ static int update_count(void **data)
 }
 
 static void
-ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
 {
 	if (tracing_is_on())
 		return;
@@ -238,7 +238,7 @@ ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
 }
 
 static void
-ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
 {
 	if (!tracing_is_on())
 		return;
@@ -247,10 +247,38 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
 		tracing_off();
 }
 
+static void
+ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	if (tracing_is_on())
+		return;
+
+	tracing_on();
+}
+
+static void
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	if (!tracing_is_on())
+		return;
+
+	tracing_off();
+}
+
 static int
 ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 			 struct ftrace_probe_ops *ops, void *data);
 
+static struct ftrace_probe_ops traceon_count_probe_ops = {
+	.func			= ftrace_traceon_count,
+	.print			= ftrace_trace_onoff_print,
+};
+
+static struct ftrace_probe_ops traceoff_count_probe_ops = {
+	.func			= ftrace_traceoff_count,
+	.print			= ftrace_trace_onoff_print,
+};
+
 static struct ftrace_probe_ops traceon_probe_ops = {
 	.func			= ftrace_traceon,
 	.print			= ftrace_trace_onoff_print,
@@ -269,7 +297,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 
 	seq_printf(m, "%ps:", (void *)ip);
 
-	if (ops == &traceon_probe_ops)
+	if (ops == &traceon_probe_ops || ops == &traceon_count_probe_ops)
 		seq_printf(m, "traceon");
 	else
 		seq_printf(m, "traceoff");
@@ -297,9 +325,9 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
 
 	/* we register both traceon and traceoff to this callback */
 	if (strcmp(cmd, "traceon") == 0)
-		ops = &traceon_probe_ops;
+		ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
 	else
-		ops = &traceoff_probe_ops;
+		ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
 
 	if (glob[0] == '!') {
 		unregister_ftrace_function_probe_func(glob+1, ops);
-- 
cgit v1.2.3


From e1df4cb682ab2c3c2981c8efa4aec044e61f4e06 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 12 Mar 2013 10:09:42 -0400
Subject: ftrace: Fix function probe to only enable needed functions

Currently the function probe enables all functions and runs a "hash"
against every function call to see if it should call a probe. This
is extremely wasteful.

Note, a probe is something like:

  echo schedule:traceoff > /debug/tracing/set_ftrace_filter

When schedule is called, the probe will disable tracing. But currently,
it has a call back for *all* functions, and checks to see if the
called function is the probe that is needed.

The probe function has been created before ftrace was rewritten to
allow for more than one "op" to be registered by the function tracer.
When probes were created, it couldn't limit the functions without also
limiting normal function calls. But now we can, it's about time
to update the probe code.

Todo, have separate ops for different entries. That is, assign
a ftrace_ops per probe, instead of one op for all probes. But
as there's not many probes assigned, this may not be that urgent.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e6effd0c40a9..dab031fec85b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2988,18 +2988,20 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp)
 	kfree(entry);
 }
 
-
 int
 register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 			      void *data)
 {
 	struct ftrace_func_probe *entry;
+	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+	struct ftrace_hash *hash;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	int type, len, not;
 	unsigned long key;
 	int count = 0;
 	char *search;
+	int ret;
 
 	type = filter_parse_regex(glob, strlen(glob), &search, &not);
 	len = strlen(search);
@@ -3010,8 +3012,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 
 	mutex_lock(&ftrace_lock);
 
-	if (unlikely(ftrace_disabled))
+	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+	if (!hash) {
+		count = -ENOMEM;
+		goto out_unlock;
+	}
+
+	if (unlikely(ftrace_disabled)) {
+		count = -ENODEV;
 		goto out_unlock;
+	}
 
 	do_for_each_ftrace_rec(pg, rec) {
 
@@ -3043,6 +3053,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 			}
 		}
 
+		ret = enter_record(hash, rec, 0);
+		if (ret < 0) {
+			kfree(entry);
+			count = ret;
+			goto out_unlock;
+		}
+
 		entry->ops = ops;
 		entry->ip = rec->ip;
 
@@ -3050,10 +3067,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
 
 	} while_for_each_ftrace_rec();
+
+	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+	if (ret < 0)
+		count = ret;
+
 	__enable_ftrace_function_probe();
 
  out_unlock:
 	mutex_unlock(&ftrace_lock);
+	free_ftrace_hash(hash);
 
 	return count;
 }
@@ -3067,7 +3090,10 @@ static void
 __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 				  void *data, int flags)
 {
+	struct ftrace_func_entry *rec_entry;
 	struct ftrace_func_probe *entry;
+	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+	struct ftrace_hash *hash;
 	struct hlist_node *n, *tmp;
 	char str[KSYM_SYMBOL_LEN];
 	int type = MATCH_FULL;
@@ -3088,6 +3114,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 	}
 
 	mutex_lock(&ftrace_lock);
+
+	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+	if (!hash)
+		/* Hmm, should report this somehow */
+		goto out_unlock;
+
 	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
 		struct hlist_head *hhd = &ftrace_func_hash[i];
 
@@ -3108,12 +3140,24 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 					continue;
 			}
 
+			rec_entry = ftrace_lookup_ip(hash, entry->ip);
+			/* It is possible more than one entry had this ip */
+			if (rec_entry)
+				free_hash_entry(hash, rec_entry);
+
 			hlist_del_rcu(&entry->node);
 			call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu);
 		}
 	}
 	__disable_ftrace_function_probe();
+	/*
+	 * Remove after the disable is called. Otherwise, if the last
+	 * probe is removed, a null hash means *all enabled*.
+	 */
+	ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ out_unlock:
 	mutex_unlock(&ftrace_lock);
+	free_ftrace_hash(hash);
 }
 
 void
-- 
cgit v1.2.3


From 3209cff4490bee55fd2bc1d087cb8ecf2a686a88 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 12 Mar 2013 11:17:54 -0400
Subject: tracing: Add alloc/free_snapshot() to replace duplicate code

Add alloc_snapshot() and free_snapshot() to allocate and free the
snapshot buffer respectively, and use these to remove duplicate
code.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 79 ++++++++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c53e4092269..906049c0af90 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -149,14 +149,14 @@ static int __init set_ftrace_dump_on_oops(char *str)
 }
 __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
 
-static int __init alloc_snapshot(char *str)
+static int __init boot_alloc_snapshot(char *str)
 {
 	allocate_snapshot = true;
 	/* We also need the main ring buffer expanded */
 	ring_buffer_expanded = true;
 	return 1;
 }
-__setup("alloc_snapshot", alloc_snapshot);
+__setup("alloc_snapshot", boot_alloc_snapshot);
 
 
 static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
@@ -470,6 +470,38 @@ EXPORT_SYMBOL_GPL(tracing_snapshot);
 
 static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
 					struct trace_buffer *size_buf, int cpu_id);
+static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
+
+static int alloc_snapshot(struct trace_array *tr)
+{
+	int ret;
+
+	if (!tr->allocated_snapshot) {
+
+		/* allocate spare buffer */
+		ret = resize_buffer_duplicate_size(&tr->max_buffer,
+				   &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
+		if (ret < 0)
+			return ret;
+
+		tr->allocated_snapshot = true;
+	}
+
+	return 0;
+}
+
+void free_snapshot(struct trace_array *tr)
+{
+	/*
+	 * We don't free the ring buffer. instead, resize it because
+	 * The max_tr ring buffer has some state (e.g. ring->clock) and
+	 * we want preserve it.
+	 */
+	ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+	set_buffer_entries(&tr->max_buffer, 1);
+	tracing_reset_online_cpus(&tr->max_buffer);
+	tr->allocated_snapshot = false;
+}
 
 /**
  * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
@@ -487,16 +519,9 @@ void tracing_snapshot_alloc(void)
 	struct trace_array *tr = &global_trace;
 	int ret;
 
-	if (!tr->allocated_snapshot) {
-
-		/* allocate spare buffer */
-		ret = resize_buffer_duplicate_size(&tr->max_buffer,
-				   &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
-		if (WARN_ON(ret < 0))
-			return;
-
-		tr->allocated_snapshot = true;
-	}
+	ret = alloc_snapshot(tr);
+	if (WARN_ON(ret < 0))
+		return;
 
 	tracing_snapshot();
 }
@@ -3581,15 +3606,7 @@ static int tracing_set_tracer(const char *buf)
 		 * so a synchronized_sched() is sufficient.
 		 */
 		synchronize_sched();
-		/*
-		 * We don't free the ring buffer. instead, resize it because
-		 * The max_tr ring buffer has some state (e.g. ring->clock) and
-		 * we want preserve it.
-		 */
-		ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
-		set_buffer_entries(&tr->max_buffer, 1);
-		tracing_reset_online_cpus(&tr->max_buffer);
-		tr->allocated_snapshot = false;
+		free_snapshot(tr);
 	}
 #endif
 	destroy_trace_option_files(topts);
@@ -3598,12 +3615,9 @@ static int tracing_set_tracer(const char *buf)
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 	if (t->use_max_tr && !had_max_tr) {
-		/* we need to make per cpu buffer sizes equivalent */
-		ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->trace_buffer,
-						   RING_BUFFER_ALL_CPUS);
+		ret = alloc_snapshot(tr);
 		if (ret < 0)
 			goto out;
-		tr->allocated_snapshot = true;
 	}
 #endif
 
@@ -4475,14 +4489,8 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 			ret = -EINVAL;
 			break;
 		}
-		if (tr->allocated_snapshot) {
-			/* free spare buffer */
-			ring_buffer_resize(tr->max_buffer.buffer, 1,
-					   RING_BUFFER_ALL_CPUS);
-			set_buffer_entries(&tr->max_buffer, 1);
-			tracing_reset_online_cpus(&tr->max_buffer);
-			tr->allocated_snapshot = false;
-		}
+		if (tr->allocated_snapshot)
+			free_snapshot(tr);
 		break;
 	case 1:
 /* Only allow per-cpu swap if the ring buffer supports it */
@@ -4493,12 +4501,9 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		}
 #endif
 		if (!tr->allocated_snapshot) {
-			/* allocate spare buffer */
-			ret = resize_buffer_duplicate_size(&tr->max_buffer,
-					&tr->trace_buffer, RING_BUFFER_ALL_CPUS);
+			ret = alloc_snapshot(tr);
 			if (ret < 0)
 				break;
-			tr->allocated_snapshot = true;
 		}
 		local_irq_disable();
 		/* Now, we're going to swap */
-- 
cgit v1.2.3


From 77fd5c15e3216b901be69047ca43b05ae9099951 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 12 Mar 2013 11:49:18 -0400
Subject: tracing: Add snapshot trigger to function probes

 echo 'schedule:snapshot:1' > /debug/tracing/set_ftrace_filter

This will cause the scheduler to trigger a snapshot the next time
it's called (you can use any function that's not called by NMI).

Even though it triggers only once, you still need to remove it with:

 echo '!schedule:snapshot:0' > /debug/tracing/set_ftrace_filter

The :1 can be left off for the first command:

 echo 'schedule:snapshot' > /debug/tracing/set_ftrace_filter

But this will cause all calls to schedule to trigger a snapshot.
This must be removed without the ':0'

 echo '!schedule:snapshot' > /debug/tracing/set_ftrace_filter

As adding a "count" is a different operation (internally).

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 906049c0af90..c5b844621562 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5086,7 +5086,114 @@ static const struct file_operations tracing_dyn_info_fops = {
 	.read		= tracing_read_dyn_info,
 	.llseek		= generic_file_llseek,
 };
-#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
+static void
+ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	tracing_snapshot();
+}
+
+static void
+ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	unsigned long *count = (long *)data;
+
+	if (!*count)
+		return;
+
+	if (*count != -1)
+		(*count)--;
+
+	tracing_snapshot();
+}
+
+static int
+ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
+		      struct ftrace_probe_ops *ops, void *data)
+{
+	long count = (long)data;
+
+	seq_printf(m, "%ps:", (void *)ip);
+
+	seq_printf(m, "snapshot");
+
+	if (count == -1)
+		seq_printf(m, ":unlimited\n");
+	else
+		seq_printf(m, ":count=%ld\n", count);
+
+	return 0;
+}
+
+static struct ftrace_probe_ops snapshot_probe_ops = {
+	.func			= ftrace_snapshot,
+	.print			= ftrace_snapshot_print,
+};
+
+static struct ftrace_probe_ops snapshot_count_probe_ops = {
+	.func			= ftrace_count_snapshot,
+	.print			= ftrace_snapshot_print,
+};
+
+static int
+ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
+			       char *glob, char *cmd, char *param, int enable)
+{
+	struct ftrace_probe_ops *ops;
+	void *count = (void *)-1;
+	char *number;
+	int ret;
+
+	/* hash funcs only work with set_ftrace_filter */
+	if (!enable)
+		return -EINVAL;
+
+	ops = param ? &snapshot_count_probe_ops :  &snapshot_probe_ops;
+
+	if (glob[0] == '!') {
+		unregister_ftrace_function_probe_func(glob+1, ops);
+		return 0;
+	}
+
+	if (!param)
+		goto out_reg;
+
+	number = strsep(&param, ":");
+
+	if (!strlen(number))
+		goto out_reg;
+
+	/*
+	 * We use the callback data field (which is a pointer)
+	 * as our counter.
+	 */
+	ret = kstrtoul(number, 0, (unsigned long *)&count);
+	if (ret)
+		return ret;
+
+ out_reg:
+	ret = register_ftrace_function_probe(glob, ops, count);
+
+	if (ret >= 0)
+		alloc_snapshot(&global_trace);
+
+	return ret < 0 ? ret : 0;
+}
+
+static struct ftrace_func_command ftrace_snapshot_cmd = {
+	.name			= "snapshot",
+	.func			= ftrace_trace_snapshot_callback,
+};
+
+static int register_snapshot_cmd(void)
+{
+	return register_ftrace_command(&ftrace_snapshot_cmd);
+}
+#else
+static inline int register_snapshot_cmd(void) { return 0; }
+#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
 
 struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
 {
@@ -6076,6 +6183,8 @@ __init static int tracer_alloc_buffers(void)
 		trace_set_options(&global_trace, option);
 	}
 
+	register_snapshot_cmd();
+
 	return 0;
 
 out_free_cpumask:
-- 
cgit v1.2.3


From e67efb93f0e9130174293ffaa5975f87b301b531 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 12 Mar 2013 15:07:59 -0400
Subject: ftrace: Clean up function probe methods

When a function probe is created, each function that the probe is
attached to, a "callback" method is called. On release of the probe,
each function entry calls the "free" method.

First, "callback" is a confusing name and does not really match what
it does. Callback sounds like it will be called when the probe
triggers. But that's not the case. This is really an "init" function,
so lets rename it as such.

Secondly, both "init" and "free" do not pass enough information back
to the handlers. Pass back the ops, ip and data for each time the
method is called. We have the information, might as well use it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index dab031fec85b..ff0ef41c6d93 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2984,7 +2984,7 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp)
 		container_of(rhp, struct ftrace_func_probe, rcu);
 
 	if (entry->ops->free)
-		entry->ops->free(&entry->data);
+		entry->ops->free(entry->ops, entry->ip, &entry->data);
 	kfree(entry);
 }
 
@@ -3045,8 +3045,8 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 		 * for each function we find. We call the callback
 		 * to give the caller an opportunity to do so.
 		 */
-		if (ops->callback) {
-			if (ops->callback(rec->ip, &entry->data) < 0) {
+		if (ops->init) {
+			if (ops->init(ops, rec->ip, &entry->data) < 0) {
 				/* caller does not like this func */
 				kfree(entry);
 				continue;
-- 
cgit v1.2.3


From 7818b3886545f89549185e4023743e2df91d1fa1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 Mar 2013 12:42:58 -0400
Subject: ftrace: Use manual free after synchronize_sched() not
 call_rcu_sched()

The entries to the probe hash must be freed after a synchronize_sched()
after the entry has been removed from the hash.

As the entries are registered with ops that may have their own callbacks,
and these callbacks may sleep, we can not use call_rcu_sched() because
the rcu callbacks registered with that are called from a softirq context.

Instead of using call_rcu_sched(), manually save the entries on a free_list
and at the end of the loop that removes the entries, do a synchronize_sched()
and then go through the free_list, freeing the entries.

Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ff0ef41c6d93..25770824598f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1068,7 +1068,7 @@ struct ftrace_func_probe {
 	unsigned long		flags;
 	unsigned long		ip;
 	void			*data;
-	struct rcu_head		rcu;
+	struct list_head	free_list;
 };
 
 struct ftrace_func_entry {
@@ -2978,11 +2978,8 @@ static void __disable_ftrace_function_probe(void)
 }
 
 
-static void ftrace_free_entry_rcu(struct rcu_head *rhp)
+static void ftrace_free_entry(struct ftrace_func_probe *entry)
 {
-	struct ftrace_func_probe *entry =
-		container_of(rhp, struct ftrace_func_probe, rcu);
-
 	if (entry->ops->free)
 		entry->ops->free(entry->ops, entry->ip, &entry->data);
 	kfree(entry);
@@ -3092,7 +3089,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 {
 	struct ftrace_func_entry *rec_entry;
 	struct ftrace_func_probe *entry;
+	struct ftrace_func_probe *p;
 	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+	struct list_head free_list;
 	struct ftrace_hash *hash;
 	struct hlist_node *n, *tmp;
 	char str[KSYM_SYMBOL_LEN];
@@ -3120,6 +3119,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 		/* Hmm, should report this somehow */
 		goto out_unlock;
 
+	INIT_LIST_HEAD(&free_list);
+
 	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
 		struct hlist_head *hhd = &ftrace_func_hash[i];
 
@@ -3146,7 +3147,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 				free_hash_entry(hash, rec_entry);
 
 			hlist_del_rcu(&entry->node);
-			call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu);
+			list_add(&entry->free_list, &free_list);
 		}
 	}
 	__disable_ftrace_function_probe();
@@ -3155,6 +3156,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 	 * probe is removed, a null hash means *all enabled*.
 	 */
 	ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+	synchronize_sched();
+	list_for_each_entry_safe(entry, p, &free_list, free_list) {
+		list_del(&entry->free_list);
+		ftrace_free_entry(entry);
+	}
+		
  out_unlock:
 	mutex_unlock(&ftrace_lock);
 	free_ftrace_hash(hash);
-- 
cgit v1.2.3


From 417944c4c7a0f657158d0515f3b8e8c043fd788f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 12 Mar 2013 13:26:18 -0400
Subject: tracing: Add a way to soft disable trace events

In order to let triggers enable or disable events, we need a 'soft'
method for doing so. For example, if a function probe is added that
lets a user enable or disable events when a function is called, that
change must be done without taking locks or a mutex, and definitely
it can't sleep. But the full enabling of a tracepoint is expensive.

By adding a 'SOFT_DISABLE' flag, and converting the flags to be updated
without the protection of a mutex (using set/clear_bit()), this soft
disable flag can be used to allow critical sections to enable or disable
events from being traced (after the event has been placed into "SOFT_MODE").

Some caveats though: The comm recorder (to map pids with a comm) can not
be soft disabled (yet). If you disable an event with with a "soft"
disable and wait a while before reading the trace, the comm cache may be
replaced and you'll get a bunch of <...> for comms in the trace.

Reading the "enable" file for an event that is disabled will now give
you "0*" where the '*' denotes that the tracepoint is still active but
the event itself is "disabled".

[ fixed _BIT used in & operation : thanks to Dan Carpenter and smatch ]

Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 75 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 62 insertions(+), 13 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 38b54c5edeb9..106640b0df4a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -205,37 +205,77 @@ void trace_event_enable_cmd_record(bool enable)
 
 		if (enable) {
 			tracing_start_cmdline_record();
-			file->flags |= FTRACE_EVENT_FL_RECORDED_CMD;
+			set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
 		} else {
 			tracing_stop_cmdline_record();
-			file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD;
+			clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
 		}
 	} while_for_each_event_file();
 	mutex_unlock(&event_mutex);
 }
 
-static int ftrace_event_enable_disable(struct ftrace_event_file *file,
-				       int enable)
+static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
+					 int enable, int soft_disable)
 {
 	struct ftrace_event_call *call = file->event_call;
 	int ret = 0;
+	int disable;
 
 	switch (enable) {
 	case 0:
-		if (file->flags & FTRACE_EVENT_FL_ENABLED) {
-			file->flags &= ~FTRACE_EVENT_FL_ENABLED;
+		/*
+		 * When soft_disable is set and enable is cleared, we want
+		 * to clear the SOFT_DISABLED flag but leave the event in the
+		 * state that it was. That is, if the event was enabled and
+		 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
+		 * is set we do not want the event to be enabled before we
+		 * clear the bit.
+		 *
+		 * When soft_disable is not set but the SOFT_MODE flag is,
+		 * we do nothing. Do not disable the tracepoint, otherwise
+		 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
+		 */
+		if (soft_disable) {
+			disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
+			clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+		} else
+			disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
+
+		if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
+			clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
 			if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
 				tracing_stop_cmdline_record();
-				file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD;
+				clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
 			}
 			call->class->reg(call, TRACE_REG_UNREGISTER, file);
 		}
+		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */
+		if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+			set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
 		break;
 	case 1:
+		/*
+		 * When soft_disable is set and enable is set, we want to
+		 * register the tracepoint for the event, but leave the event
+		 * as is. That means, if the event was already enabled, we do
+		 * nothing (but set SOFT_MODE). If the event is disabled, we
+		 * set SOFT_DISABLED before enabling the event tracepoint, so
+		 * it still seems to be disabled.
+		 */
+		if (!soft_disable)
+			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+		else
+			set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+
 		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
+
+			/* Keep the event disabled, when going to SOFT_MODE. */
+			if (soft_disable)
+				set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+
 			if (trace_flags & TRACE_ITER_RECORD_CMD) {
 				tracing_start_cmdline_record();
-				file->flags |= FTRACE_EVENT_FL_RECORDED_CMD;
+				set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
 			}
 			ret = call->class->reg(call, TRACE_REG_REGISTER, file);
 			if (ret) {
@@ -244,7 +284,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file,
 					"%s\n", call->name);
 				break;
 			}
-			file->flags |= FTRACE_EVENT_FL_ENABLED;
+			set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
 
 			/* WAS_ENABLED gets set but never cleared. */
 			call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
@@ -255,6 +295,12 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file,
 	return ret;
 }
 
+static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+				       int enable)
+{
+	return __ftrace_event_enable_disable(file, enable, 0);
+}
+
 static void ftrace_clear_events(struct trace_array *tr)
 {
 	struct ftrace_event_file *file;
@@ -547,12 +593,15 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_event_file *file = filp->private_data;
 	char *buf;
 
-	if (file->flags & FTRACE_EVENT_FL_ENABLED)
-		buf = "1\n";
-	else
+	if (file->flags & FTRACE_EVENT_FL_ENABLED) {
+		if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
+			buf = "0*\n";
+		else
+			buf = "1\n";
+	} else
 		buf = "0\n";
 
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
 }
 
 static ssize_t
-- 
cgit v1.2.3


From 3cd715de261182413b3487abfffe1b6af41b81b3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 12 Mar 2013 19:35:13 -0400
Subject: tracing: Add function probe triggers to enable/disable events

Add triggers to function tracer that lets an event get enabled or
disabled when a function is called:

format is:

 <function>:enable_event:<system>:<event>[:<count>]
 <function>:disable_event:<system>:<event>[:<count>]

 echo 'schedule:enable_event:sched:sched_switch' > /debug/tracing/set_ftrace_filter

Every time schedule is called, it will enable the sched_switch event.

 echo 'schedule:disable_event:sched:sched_switch:2' > /debug/tracing/set_ftrace_filter

The first two times schedule is called while the sched_switch
event is enabled, it will disable it. It will not count for a time
that the event is already disabled (or enabled for enable_event).

[ fixed return without mutex_unlock() - thanks to Dan Carpenter and smatch ]

Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 279 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 106640b0df4a..c636523b1a59 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1798,6 +1798,283 @@ __trace_add_event_dirs(struct trace_array *tr)
 	}
 }
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/* Avoid typos */
+#define ENABLE_EVENT_STR	"enable_event"
+#define DISABLE_EVENT_STR	"disable_event"
+
+struct event_probe_data {
+	struct ftrace_event_file	*file;
+	unsigned long			count;
+	int				ref;
+	bool				enable;
+};
+
+static struct ftrace_event_file *
+find_event_file(struct trace_array *tr, const char *system,  const char *event)
+{
+	struct ftrace_event_file *file;
+	struct ftrace_event_call *call;
+
+	list_for_each_entry(file, &tr->events, list) {
+
+		call = file->event_call;
+
+		if (!call->name || !call->class || !call->class->reg)
+			continue;
+
+		if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+			continue;
+
+		if (strcmp(event, call->name) == 0 &&
+		    strcmp(system, call->class->system) == 0)
+			return file;
+	}
+	return NULL;
+}
+
+static void
+event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+{
+	struct event_probe_data **pdata = (struct event_probe_data **)_data;
+	struct event_probe_data *data = *pdata;
+
+	if (!data)
+		return;
+
+	if (data->enable)
+		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+	else
+		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+}
+
+static void
+event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+{
+	struct event_probe_data **pdata = (struct event_probe_data **)_data;
+	struct event_probe_data *data = *pdata;
+
+	if (!data)
+		return;
+
+	if (!data->count)
+		return;
+
+	/* Skip if the event is in a state we want to switch to */
+	if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+		return;
+
+	if (data->count != -1)
+		(data->count)--;
+
+	event_enable_probe(ip, parent_ip, _data);
+}
+
+static int
+event_enable_print(struct seq_file *m, unsigned long ip,
+		      struct ftrace_probe_ops *ops, void *_data)
+{
+	struct event_probe_data *data = _data;
+
+	seq_printf(m, "%ps:", (void *)ip);
+
+	seq_printf(m, "%s:%s:%s",
+		   data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+		   data->file->event_call->class->system,
+		   data->file->event_call->name);
+
+	if (data->count == -1)
+		seq_printf(m, ":unlimited\n");
+	else
+		seq_printf(m, ":count=%ld\n", data->count);
+
+	return 0;
+}
+
+static int
+event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
+		  void **_data)
+{
+	struct event_probe_data **pdata = (struct event_probe_data **)_data;
+	struct event_probe_data *data = *pdata;
+
+	data->ref++;
+	return 0;
+}
+
+static void
+event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
+		  void **_data)
+{
+	struct event_probe_data **pdata = (struct event_probe_data **)_data;
+	struct event_probe_data *data = *pdata;
+
+	if (WARN_ON_ONCE(data->ref <= 0))
+		return;
+
+	data->ref--;
+	if (!data->ref) {
+		/* Remove the SOFT_MODE flag */
+		__ftrace_event_enable_disable(data->file, 0, 1);
+		module_put(data->file->event_call->mod);
+		kfree(data);
+	}
+	*pdata = NULL;
+}
+
+static struct ftrace_probe_ops event_enable_probe_ops = {
+	.func			= event_enable_probe,
+	.print			= event_enable_print,
+	.init			= event_enable_init,
+	.free			= event_enable_free,
+};
+
+static struct ftrace_probe_ops event_enable_count_probe_ops = {
+	.func			= event_enable_count_probe,
+	.print			= event_enable_print,
+	.init			= event_enable_init,
+	.free			= event_enable_free,
+};
+
+static struct ftrace_probe_ops event_disable_probe_ops = {
+	.func			= event_enable_probe,
+	.print			= event_enable_print,
+	.init			= event_enable_init,
+	.free			= event_enable_free,
+};
+
+static struct ftrace_probe_ops event_disable_count_probe_ops = {
+	.func			= event_enable_count_probe,
+	.print			= event_enable_print,
+	.init			= event_enable_init,
+	.free			= event_enable_free,
+};
+
+static int
+event_enable_func(struct ftrace_hash *hash,
+		  char *glob, char *cmd, char *param, int enabled)
+{
+	struct trace_array *tr = top_trace_array();
+	struct ftrace_event_file *file;
+	struct ftrace_probe_ops *ops;
+	struct event_probe_data *data;
+	const char *system;
+	const char *event;
+	char *number;
+	bool enable;
+	int ret;
+
+	/* hash funcs only work with set_ftrace_filter */
+	if (!enabled)
+		return -EINVAL;
+
+	if (!param)
+		return -EINVAL;
+
+	system = strsep(&param, ":");
+	if (!param)
+		return -EINVAL;
+
+	event = strsep(&param, ":");
+
+	mutex_lock(&event_mutex);
+
+	ret = -EINVAL;
+	file = find_event_file(tr, system, event);
+	if (!file)
+		goto out;
+
+	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+
+	if (enable)
+		ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
+	else
+		ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
+
+	if (glob[0] == '!') {
+		unregister_ftrace_function_probe_func(glob+1, ops);
+		ret = 0;
+		goto out;
+	}
+
+	ret = -ENOMEM;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out;
+
+	data->enable = enable;
+	data->count = -1;
+	data->file = file;
+
+	if (!param)
+		goto out_reg;
+
+	number = strsep(&param, ":");
+
+	ret = -EINVAL;
+	if (!strlen(number))
+		goto out_free;
+
+	/*
+	 * We use the callback data field (which is a pointer)
+	 * as our counter.
+	 */
+	ret = kstrtoul(number, 0, &data->count);
+	if (ret)
+		goto out_free;
+
+ out_reg:
+	/* Don't let event modules unload while probe registered */
+	ret = try_module_get(file->event_call->mod);
+	if (!ret)
+		goto out_free;
+
+	ret = __ftrace_event_enable_disable(file, 1, 1);
+	if (ret < 0)
+		goto out_put;
+	ret = register_ftrace_function_probe(glob, ops, data);
+	if (!ret)
+		goto out_disable;
+ out:
+	mutex_unlock(&event_mutex);
+	return ret;
+
+ out_disable:
+	__ftrace_event_enable_disable(file, 0, 1);
+ out_put:
+	module_put(file->event_call->mod);
+ out_free:
+	kfree(data);
+	goto out;
+}
+
+static struct ftrace_func_command event_enable_cmd = {
+	.name			= ENABLE_EVENT_STR,
+	.func			= event_enable_func,
+};
+
+static struct ftrace_func_command event_disable_cmd = {
+	.name			= DISABLE_EVENT_STR,
+	.func			= event_enable_func,
+};
+
+static __init int register_event_cmds(void)
+{
+	int ret;
+
+	ret = register_ftrace_command(&event_enable_cmd);
+	if (WARN_ON(ret < 0))
+		return ret;
+	ret = register_ftrace_command(&event_disable_cmd);
+	if (WARN_ON(ret < 0))
+		unregister_ftrace_command(&event_enable_cmd);
+	return ret;
+}
+#else
+static inline int register_event_cmds(void) { return 0; }
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
 /*
  * The top level array has already had its ftrace_event_file
  * descriptors created in order to allow for early events to
@@ -2058,6 +2335,8 @@ static __init int event_trace_enable(void)
 
 	trace_printk_start_comm();
 
+	register_event_cmds();
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From c142be8ebe0b7bf73c8a0063925623f3e4b980c0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 Mar 2013 09:55:57 -0400
Subject: tracing: Add skip argument to trace_dump_stack()

Altough the trace_dump_stack() already skips three functions in
the call to stack trace, which gets the stack trace to start
at the caller of the function, the caller may want to skip some
more too (as it may have helper functions).

Add a skip argument to the trace_dump_stack() that lets the caller
skip back tracing functions that it doesn't care about.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c5b844621562..8aa53213201f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1657,8 +1657,9 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
 
 /**
  * trace_dump_stack - record a stack back trace in the trace buffer
+ * @skip: Number of functions to skip (helper handlers)
  */
-void trace_dump_stack(void)
+void trace_dump_stack(int skip)
 {
 	unsigned long flags;
 
@@ -1667,9 +1668,13 @@ void trace_dump_stack(void)
 
 	local_save_flags(flags);
 
-	/* skipping 3 traces, seems to get us at the caller of this function */
-	__ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, 3,
-			     preempt_count(), NULL);
+	/*
+	 * Skip 3 more, seems to get us at the caller of
+	 * this function.
+	 */
+	skip += 3;
+	__ftrace_trace_stack(global_trace.trace_buffer.buffer,
+			     flags, skip, preempt_count(), NULL);
 }
 
 static DEFINE_PER_CPU(int, user_stack_count);
-- 
cgit v1.2.3


From dd42cd3ea96d687f15525c4f14fa582702db223f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 Mar 2013 10:17:50 -0400
Subject: tracing: Add function probe to trigger stack traces

Add a function probe that will cause a stack trace to be traced in
the ring buffer when the given function(s) are called.

format is:

 <function>:stacktrace[:<count>]

 echo 'schedule:stacktrace' > /debug/tracing/set_ftrace_filter
 cat /debug/tracing/trace_pipe
     kworker/2:0-4329  [002] ...2  2933.558007: <stack trace>
 => kthread
 => ret_from_fork
          <idle>-0     [000] .N.2  2933.558019: <stack trace>
 => rest_init
 => start_kernel
 => x86_64_start_reservations
 => x86_64_start_kernel
     kworker/2:0-4329  [002] ...2  2933.558109: <stack trace>
 => kthread
 => ret_from_fork
[...]

This can be set to only trace a specific amount of times:

 echo 'schedule:stacktrace:3' > /debug/tracing/set_ftrace_filter
 cat /debug/tracing/trace_pipe
           <...>-58    [003] ...2   841.801694: <stack trace>
 => kthread
 => ret_from_fork
          <idle>-0     [001] .N.2   841.801697: <stack trace>
 => start_secondary
           <...>-2059  [001] ...2   841.801736: <stack trace>
 => wait_for_common
 => wait_for_completion
 => flush_work
 => tty_flush_to_ldisc
 => input_available_p
 => n_tty_poll
 => tty_poll
 => do_select
 => core_sys_select
 => sys_select
 => system_call_fastpath

To remove these:

 echo '!schedule:stacktrace' > /debug/tracing/set_ftrace_filter
 echo '!schedule:stacktrace:0' > /debug/tracing/set_ftrace_filter

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions.c | 150 +++++++++++++++++++++++++++++++----------
 1 file changed, 115 insertions(+), 35 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 043b2425ae73..c4d6d7191988 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -265,56 +265,103 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
 	tracing_off();
 }
 
+/*
+ * Skip 4:
+ *   ftrace_stacktrace()
+ *   function_trace_probe_call()
+ *   ftrace_ops_list_func()
+ *   ftrace_call()
+ */
+#define STACK_SKIP 4
+
+static void
+ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	trace_dump_stack(STACK_SKIP);
+}
+
+static void
+ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	if (!tracing_is_on())
+		return;
+
+	if (update_count(data))
+		trace_dump_stack(STACK_SKIP);
+}
+
 static int
-ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
-			 struct ftrace_probe_ops *ops, void *data);
+ftrace_probe_print(const char *name, struct seq_file *m,
+		   unsigned long ip, void *data)
+{
+	long count = (long)data;
+
+	seq_printf(m, "%ps:%s", (void *)ip, name);
+
+	if (count == -1)
+		seq_printf(m, ":unlimited\n");
+	else
+		seq_printf(m, ":count=%ld\n", count);
+
+	return 0;
+}
+
+static int
+ftrace_traceon_print(struct seq_file *m, unsigned long ip,
+			 struct ftrace_probe_ops *ops, void *data)
+{
+	return ftrace_probe_print("traceon", m, ip, data);
+}
+
+static int
+ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
+			 struct ftrace_probe_ops *ops, void *data)
+{
+	return ftrace_probe_print("traceoff", m, ip, data);
+}
+
+static int
+ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
+			struct ftrace_probe_ops *ops, void *data)
+{
+	return ftrace_probe_print("stacktrace", m, ip, data);
+}
 
 static struct ftrace_probe_ops traceon_count_probe_ops = {
 	.func			= ftrace_traceon_count,
-	.print			= ftrace_trace_onoff_print,
+	.print			= ftrace_traceon_print,
 };
 
 static struct ftrace_probe_ops traceoff_count_probe_ops = {
 	.func			= ftrace_traceoff_count,
-	.print			= ftrace_trace_onoff_print,
+	.print			= ftrace_traceoff_print,
+};
+
+static struct ftrace_probe_ops stacktrace_count_probe_ops = {
+	.func			= ftrace_stacktrace_count,
+	.print			= ftrace_stacktrace_print,
 };
 
 static struct ftrace_probe_ops traceon_probe_ops = {
 	.func			= ftrace_traceon,
-	.print			= ftrace_trace_onoff_print,
+	.print			= ftrace_traceon_print,
 };
 
 static struct ftrace_probe_ops traceoff_probe_ops = {
 	.func			= ftrace_traceoff,
-	.print			= ftrace_trace_onoff_print,
+	.print			= ftrace_traceoff_print,
 };
 
-static int
-ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
-			 struct ftrace_probe_ops *ops, void *data)
-{
-	long count = (long)data;
-
-	seq_printf(m, "%ps:", (void *)ip);
-
-	if (ops == &traceon_probe_ops || ops == &traceon_count_probe_ops)
-		seq_printf(m, "traceon");
-	else
-		seq_printf(m, "traceoff");
-
-	if (count == -1)
-		seq_printf(m, ":unlimited\n");
-	else
-		seq_printf(m, ":count=%ld\n", count);
-
-	return 0;
-}
+static struct ftrace_probe_ops stacktrace_probe_ops = {
+	.func			= ftrace_stacktrace,
+	.print			= ftrace_stacktrace_print,
+};
 
 static int
-ftrace_trace_onoff_callback(struct ftrace_hash *hash,
-			    char *glob, char *cmd, char *param, int enable)
+ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
+			    struct ftrace_hash *hash, char *glob,
+			    char *cmd, char *param, int enable)
 {
-	struct ftrace_probe_ops *ops;
 	void *count = (void *)-1;
 	char *number;
 	int ret;
@@ -323,12 +370,6 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
 	if (!enable)
 		return -EINVAL;
 
-	/* we register both traceon and traceoff to this callback */
-	if (strcmp(cmd, "traceon") == 0)
-		ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
-	else
-		ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
-
 	if (glob[0] == '!') {
 		unregister_ftrace_function_probe_func(glob+1, ops);
 		return 0;
@@ -356,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
 	return ret < 0 ? ret : 0;
 }
 
+static int
+ftrace_trace_onoff_callback(struct ftrace_hash *hash,
+			    char *glob, char *cmd, char *param, int enable)
+{
+	struct ftrace_probe_ops *ops;
+
+	/* we register both traceon and traceoff to this callback */
+	if (strcmp(cmd, "traceon") == 0)
+		ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
+	else
+		ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
+
+	return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+					   param, enable);
+}
+
+static int
+ftrace_stacktrace_callback(struct ftrace_hash *hash,
+			   char *glob, char *cmd, char *param, int enable)
+{
+	struct ftrace_probe_ops *ops;
+
+	ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
+
+	return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+					   param, enable);
+}
+
 static struct ftrace_func_command ftrace_traceon_cmd = {
 	.name			= "traceon",
 	.func			= ftrace_trace_onoff_callback,
@@ -366,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
 	.func			= ftrace_trace_onoff_callback,
 };
 
+static struct ftrace_func_command ftrace_stacktrace_cmd = {
+	.name			= "stacktrace",
+	.func			= ftrace_stacktrace_callback,
+};
+
 static int __init init_func_cmd_traceon(void)
 {
 	int ret;
@@ -377,6 +451,12 @@ static int __init init_func_cmd_traceon(void)
 	ret = register_ftrace_command(&ftrace_traceon_cmd);
 	if (ret)
 		unregister_ftrace_command(&ftrace_traceoff_cmd);
+
+	ret = register_ftrace_command(&ftrace_stacktrace_cmd);
+	if (ret) {
+		unregister_ftrace_command(&ftrace_traceoff_cmd);
+		unregister_ftrace_command(&ftrace_traceon_cmd);
+	}
 	return ret;
 }
 #else
-- 
cgit v1.2.3


From 87889501d0adfae10e3b0f0e6f2d7536eed9ae84 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 Mar 2013 20:43:57 -0400
Subject: tracing: Use stack of calling function for stack tracer

Use the stack of stack_trace_call() instead of check_stack() as
the test pointer for max stack size. It makes it a bit cleaner
and a little more accurate.

Adding stable, as a later fix depends on this patch.

Cc: stable@vger.kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc701..dc02e29d8255 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -39,20 +39,21 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
 int stack_tracer_enabled;
 static int last_stack_tracer_enabled;
 
-static inline void check_stack(void)
+static inline void
+check_stack(unsigned long *stack)
 {
 	unsigned long this_size, flags;
 	unsigned long *p, *top, *start;
 	int i;
 
-	this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
+	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
 	this_size = THREAD_SIZE - this_size;
 
 	if (this_size <= max_stack_size)
 		return;
 
 	/* we do not handle interrupt stacks yet */
-	if (!object_is_on_stack(&this_size))
+	if (!object_is_on_stack(stack))
 		return;
 
 	local_irq_save(flags);
@@ -73,7 +74,7 @@ static inline void check_stack(void)
 	 * Now find where in the stack these are.
 	 */
 	i = 0;
-	start = &this_size;
+	start = stack;
 	top = (unsigned long *)
 		(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
 
@@ -113,6 +114,7 @@ static void
 stack_trace_call(unsigned long ip, unsigned long parent_ip,
 		 struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
+	unsigned long stack;
 	int cpu;
 
 	preempt_disable_notrace();
@@ -122,7 +124,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	if (per_cpu(trace_active, cpu)++ != 0)
 		goto out;
 
-	check_stack();
+	check_stack(&stack);
 
  out:
 	per_cpu(trace_active, cpu)--;
-- 
cgit v1.2.3


From d4ecbfc49b4b1d4b597fb5ba9e4fa25d62f105c5 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 Mar 2013 21:25:35 -0400
Subject: tracing: Fix stack tracer with fentry use

When gcc 4.6 on x86 is used, the function tracer will use the new
option -mfentry which does a call to "fentry" at every function
instead of "mcount". The significance of this is that fentry is
called as the first operation of the function instead of the mcount
usage of being called after the stack.

This causes the stack tracer to show some bogus results for the size
of the last function traced, as well as showing "ftrace_call" instead
of the function. This is due to the stack frame not being set up
by the function that is about to be traced.

 # cat stack_trace
        Depth    Size   Location    (48 entries)
        -----    ----   --------
  0)     4824     216   ftrace_call+0x5/0x2f
  1)     4608     112   ____cache_alloc+0xb7/0x22d
  2)     4496      80   kmem_cache_alloc+0x63/0x12f

The 216 size for ftrace_call includes both the ftrace_call stack
(which includes the saving of registers it does), as well as the
stack size of the parent.

To fix this, if CC_USING_FENTRY is defined, then the stack_tracer
will reserve the first item in stack_dump_trace[] array when
calling save_stack_trace(), and it will fill it in with the parent ip.
Then the code will look for the parent pointer on the stack and
give the real size of the parent's stack pointer:

 # cat stack_trace
        Depth    Size   Location    (14 entries)
        -----    ----   --------
  0)     2640      48   update_group_power+0x26/0x187
  1)     2592     224   update_sd_lb_stats+0x2a5/0x4ac
  2)     2368     160   find_busiest_group+0x31/0x1f1
  3)     2208     256   load_balance+0xd9/0x662

I'm Cc'ing stable, although it's not urgent, as it only shows bogus
size for item #0, the rest of the trace is legit. It should still be
corrected in previous stable releases.

Cc: stable@vger.kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index dc02e29d8255..ea28e4b0ed58 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,13 +20,27 @@
 
 #define STACK_TRACE_ENTRIES 500
 
+/*
+ * If fentry is used, then the function being traced will
+ * jump to fentry directly before it sets up its stack frame.
+ * We need to ignore that one and record the parent. Since
+ * the stack frame for the traced function wasn't set up yet,
+ * the stack_trace wont see the parent. That needs to be added
+ * manually to stack_dump_trace[] as the first element.
+ */
+#ifdef CC_USING_FENTRY
+# define add_func	1
+#else
+# define add_func	0
+#endif
+
 static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
 	 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
 static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
 
 static struct stack_trace max_stack_trace = {
-	.max_entries		= STACK_TRACE_ENTRIES,
-	.entries		= stack_dump_trace,
+	.max_entries		= STACK_TRACE_ENTRIES - add_func,
+	.entries		= &stack_dump_trace[add_func],
 };
 
 static unsigned long max_stack_size;
@@ -40,7 +54,7 @@ int stack_tracer_enabled;
 static int last_stack_tracer_enabled;
 
 static inline void
-check_stack(unsigned long *stack)
+check_stack(unsigned long ip, unsigned long *stack)
 {
 	unsigned long this_size, flags;
 	unsigned long *p, *top, *start;
@@ -70,6 +84,17 @@ check_stack(unsigned long *stack)
 
 	save_stack_trace(&max_stack_trace);
 
+	/*
+	 * When fentry is used, the traced function does not get
+	 * its stack frame set up, and we lose the parent.
+	 * Add that one in manally. We set up save_stack_trace()
+	 * to not touch the first element in this case.
+	 */
+	if (add_func) {
+		stack_dump_trace[0] = ip;
+		max_stack_trace.nr_entries++;
+	}
+
 	/*
 	 * Now find where in the stack these are.
 	 */
@@ -124,7 +149,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	if (per_cpu(trace_active, cpu)++ != 0)
 		goto out;
 
-	check_stack(&stack);
+	check_stack(parent_ip, &stack);
 
  out:
 	per_cpu(trace_active, cpu)--;
-- 
cgit v1.2.3


From 4df297129f622bdc18935c856f42b9ddd18f9f28 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 Mar 2013 23:34:22 -0400
Subject: tracing: Remove most or all of stack tracer stack size from
 stack_max_size

Currently, the depth reported in the stack tracer stack_trace file
does not match the stack_max_size file. This is because the stack_max_size
includes the overhead of stack tracer itself while the depth does not.

The first time a max is triggered, a calculation is not performed that
figures out the overhead of the stack tracer and subtracts it from
the stack_max_size variable. The overhead is stored and is subtracted
from the reported stack size for comparing for a new max.

Now the stack_max_size corresponds to the reported depth:

 # cat stack_max_size
4640

 # cat stack_trace
        Depth    Size   Location    (48 entries)
        -----    ----   --------
  0)     4640      32   _raw_spin_lock+0x18/0x24
  1)     4608     112   ____cache_alloc+0xb7/0x22d
  2)     4496      80   kmem_cache_alloc+0x63/0x12f
  3)     4416      16   mempool_alloc_slab+0x15/0x17
[...]

While testing against and older gcc on x86 that uses mcount instead
of fentry, I found that pasing in ip + MCOUNT_INSN_SIZE let the
stack trace show one more function deep which was missing before.

Cc: stable@vger.kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 75 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 54 insertions(+), 21 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index ea28e4b0ed58..aab277b67fa9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,27 +20,24 @@
 
 #define STACK_TRACE_ENTRIES 500
 
-/*
- * If fentry is used, then the function being traced will
- * jump to fentry directly before it sets up its stack frame.
- * We need to ignore that one and record the parent. Since
- * the stack frame for the traced function wasn't set up yet,
- * the stack_trace wont see the parent. That needs to be added
- * manually to stack_dump_trace[] as the first element.
- */
 #ifdef CC_USING_FENTRY
-# define add_func	1
+# define fentry		1
 #else
-# define add_func	0
+# define fentry		0
 #endif
 
 static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
 	 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
 static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
 
+/*
+ * Reserve one entry for the passed in ip. This will allow
+ * us to remove most or all of the stack size overhead
+ * added by the stack tracer itself.
+ */
 static struct stack_trace max_stack_trace = {
-	.max_entries		= STACK_TRACE_ENTRIES - add_func,
-	.entries		= &stack_dump_trace[add_func],
+	.max_entries		= STACK_TRACE_ENTRIES - 1,
+	.entries		= &stack_dump_trace[1],
 };
 
 static unsigned long max_stack_size;
@@ -58,10 +55,14 @@ check_stack(unsigned long ip, unsigned long *stack)
 {
 	unsigned long this_size, flags;
 	unsigned long *p, *top, *start;
+	static int tracer_frame;
+	int frame_size = ACCESS_ONCE(tracer_frame);
 	int i;
 
 	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
 	this_size = THREAD_SIZE - this_size;
+	/* Remove the frame of the tracer */
+	this_size -= frame_size;
 
 	if (this_size <= max_stack_size)
 		return;
@@ -73,6 +74,10 @@ check_stack(unsigned long ip, unsigned long *stack)
 	local_irq_save(flags);
 	arch_spin_lock(&max_stack_lock);
 
+	/* In case another CPU set the tracer_frame on us */
+	if (unlikely(!frame_size))
+		this_size -= tracer_frame;
+
 	/* a race could have already updated it */
 	if (this_size <= max_stack_size)
 		goto out;
@@ -85,15 +90,12 @@ check_stack(unsigned long ip, unsigned long *stack)
 	save_stack_trace(&max_stack_trace);
 
 	/*
-	 * When fentry is used, the traced function does not get
-	 * its stack frame set up, and we lose the parent.
-	 * Add that one in manally. We set up save_stack_trace()
-	 * to not touch the first element in this case.
+	 * Add the passed in ip from the function tracer.
+	 * Searching for this on the stack will skip over
+	 * most of the overhead from the stack tracer itself.
 	 */
-	if (add_func) {
-		stack_dump_trace[0] = ip;
-		max_stack_trace.nr_entries++;
-	}
+	stack_dump_trace[0] = ip;
+	max_stack_trace.nr_entries++;
 
 	/*
 	 * Now find where in the stack these are.
@@ -123,6 +125,18 @@ check_stack(unsigned long ip, unsigned long *stack)
 				found = 1;
 				/* Start the search from here */
 				start = p + 1;
+				/*
+				 * We do not want to show the overhead
+				 * of the stack tracer stack in the
+				 * max stack. If we haven't figured
+				 * out what that is, then figure it out
+				 * now.
+				 */
+				if (unlikely(!tracer_frame) && i == 1) {
+					tracer_frame = (p - stack) *
+						sizeof(unsigned long);
+					max_stack_size -= tracer_frame;
+				}
 			}
 		}
 
@@ -149,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	if (per_cpu(trace_active, cpu)++ != 0)
 		goto out;
 
-	check_stack(parent_ip, &stack);
+	/*
+	 * When fentry is used, the traced function does not get
+	 * its stack frame set up, and we lose the parent.
+	 * The ip is pretty useless because the function tracer
+	 * was called before that function set up its stack frame.
+	 * In this case, we use the parent ip.
+	 *
+	 * By adding the return address of either the parent ip
+	 * or the current ip we can disregard most of the stack usage
+	 * caused by the stack tracer itself.
+	 *
+	 * The function tracer always reports the address of where the
+	 * mcount call was, but the stack will hold the return address.
+	 */
+	if (fentry)
+		ip = parent_ip;
+	else
+		ip += MCOUNT_INSN_SIZE;
+
+	check_stack(ip, &stack);
 
  out:
 	per_cpu(trace_active, cpu)--;
-- 
cgit v1.2.3


From 328df4759c03e2c3e7429cc6cb0e180c38f32063 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 14 Mar 2013 12:10:40 -0400
Subject: tracing: Add function-trace option to disable function tracing of
 latency tracers

Currently, the only way to stop the latency tracers from doing function
tracing is to fully disable the function tracer from the proc file
system:

  echo 0 > /proc/sys/kernel/ftrace_enabled

This is a big hammer approach as it disables function tracing for
all users. This includes kprobes, perf, stack tracer, etc.

Instead, create a function-trace option that the latency tracers can
check to determine if it should enable function tracing or not.
This option can be set or cleared even while the tracer is active
and the tracers will disable or enable function tracing depending
on how the option was set.

Instead of using the proc file, disable latency function tracing with

  echo 0 > /debug/tracing/options/function-trace

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c              |  3 +-
 kernel/trace/trace.h              |  1 +
 kernel/trace/trace_irqsoff.c      | 67 ++++++++++++++++++++++++++++++++-------
 kernel/trace/trace_sched_wakeup.c | 63 ++++++++++++++++++++++++++++++------
 4 files changed, 111 insertions(+), 23 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8aa53213201f..f90ca16afcf2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -328,7 +328,7 @@ static inline void trace_access_lock_init(void)
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
 	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
-	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
+	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
 
 /**
  * tracing_on - enable tracing buffers
@@ -635,6 +635,7 @@ static const char *trace_options[] = {
 	"disable_on_free",
 	"irq-info",
 	"markers",
+	"function-trace",
 	NULL
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0e430b401ab6..5cc52361bc9f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -867,6 +867,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_STOP_ON_FREE		= 0x400000,
 	TRACE_ITER_IRQ_INFO		= 0x800000,
 	TRACE_ITER_MARKERS		= 0x1000000,
+	TRACE_ITER_FUNCTION		= 0x2000000,
 };
 
 /*
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5aa40ab72b57..b19d065a28cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -33,6 +33,7 @@ enum {
 static int trace_type __read_mostly;
 
 static int save_flags;
+static bool function_enabled;
 
 static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
 static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
 }
 #endif /* CONFIG_PREEMPT_TRACER */
 
-static int start_irqsoff_tracer(struct trace_array *tr, int graph)
+static int register_irqsoff_function(int graph, int set)
 {
-	int ret = 0;
+	int ret;
 
-	if (!graph)
-		ret = register_ftrace_function(&trace_ops);
-	else
+	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+		return 0;
+
+	if (graph)
 		ret = register_ftrace_graph(&irqsoff_graph_return,
 					    &irqsoff_graph_entry);
+	else
+		ret = register_ftrace_function(&trace_ops);
+
+	if (!ret)
+		function_enabled = true;
+
+	return ret;
+}
+
+static void unregister_irqsoff_function(int graph)
+{
+	if (!function_enabled)
+		return;
+
+	if (graph)
+		unregister_ftrace_graph();
+	else
+		unregister_ftrace_function(&trace_ops);
+
+	function_enabled = false;
+}
+
+static void irqsoff_function_set(int set)
+{
+	if (set)
+		register_irqsoff_function(is_graph(), 1);
+	else
+		unregister_irqsoff_function(is_graph());
+}
+
+static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
+{
+	if (mask & TRACE_ITER_FUNCTION)
+		irqsoff_function_set(set);
+
+	return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
+{
+	int ret;
+
+	ret = register_irqsoff_function(graph, 0);
 
 	if (!ret && tracing_is_enabled())
 		tracer_enabled = 1;
@@ -550,10 +596,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
 {
 	tracer_enabled = 0;
 
-	if (!graph)
-		unregister_ftrace_function(&trace_ops);
-	else
-		unregister_ftrace_graph();
+	unregister_irqsoff_function(graph);
 }
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -615,7 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.print_line     = irqsoff_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= irqsoff_set_flag,
-	.flag_changed	= trace_keep_overwrite,
+	.flag_changed	= irqsoff_flag_changed,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_irqsoff,
 #endif
@@ -649,7 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =
 	.print_line     = irqsoff_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= irqsoff_set_flag,
-	.flag_changed	= trace_keep_overwrite,
+	.flag_changed	= irqsoff_flag_changed,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptoff,
 #endif
@@ -685,7 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 	.print_line     = irqsoff_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= irqsoff_set_flag,
-	.flag_changed	= trace_keep_overwrite,
+	.flag_changed	= irqsoff_flag_changed,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptirqsoff,
 #endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index c16f8cd63c3c..fee77e15d815 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -37,6 +37,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
 static void wakeup_graph_return(struct ftrace_graph_ret *trace);
 
 static int save_flags;
+static bool function_enabled;
 
 #define TRACE_DISPLAY_GRAPH     1
 
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =
 };
 #endif /* CONFIG_FUNCTION_TRACER */
 
-static int start_func_tracer(int graph)
+static int register_wakeup_function(int graph, int set)
 {
 	int ret;
 
-	if (!graph)
-		ret = register_ftrace_function(&trace_ops);
-	else
+	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+		return 0;
+
+	if (graph)
 		ret = register_ftrace_graph(&wakeup_graph_return,
 					    &wakeup_graph_entry);
+	else
+		ret = register_ftrace_function(&trace_ops);
+
+	if (!ret)
+		function_enabled = true;
+
+	return ret;
+}
+
+static void unregister_wakeup_function(int graph)
+{
+	if (!function_enabled)
+		return;
+
+	if (graph)
+		unregister_ftrace_graph();
+	else
+		unregister_ftrace_function(&trace_ops);
+
+	function_enabled = false;
+}
+
+static void wakeup_function_set(int set)
+{
+	if (set)
+		register_wakeup_function(is_graph(), 1);
+	else
+		unregister_wakeup_function(is_graph());
+}
+
+static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
+{
+	if (mask & TRACE_ITER_FUNCTION)
+		wakeup_function_set(set);
+
+	return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_func_tracer(int graph)
+{
+	int ret;
+
+	ret = register_wakeup_function(graph, 0);
 
 	if (!ret && tracing_is_enabled())
 		tracer_enabled = 1;
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)
 {
 	tracer_enabled = 0;
 
-	if (!graph)
-		unregister_ftrace_function(&trace_ops);
-	else
-		unregister_ftrace_graph();
+	unregister_wakeup_function(graph);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -600,7 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =
 	.print_line	= wakeup_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= wakeup_set_flag,
-	.flag_changed	= trace_keep_overwrite,
+	.flag_changed	= wakeup_flag_changed,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
@@ -622,7 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
 	.print_line	= wakeup_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= wakeup_set_flag,
-	.flag_changed	= trace_keep_overwrite,
+	.flag_changed	= wakeup_flag_changed,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
-- 
cgit v1.2.3


From 8aacf017b065a805d27467843490c976835eb4a5 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 14 Mar 2013 13:13:45 -0400
Subject: tracing: Add "uptime" trace clock that uses jiffies

Add a simple trace clock called "uptime" for those that are
interested in the uptime of the trace. It uses jiffies as that's
the safest method, as other uptime clocks grab seq locks, which could
cause a deadlock if taken from an event or function tracer.

Requested-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c       |  1 +
 kernel/trace/trace_clock.c | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f90ca16afcf2..8eabfbb8003e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -647,6 +647,7 @@ static struct {
 	{ trace_clock_local,	"local",	1 },
 	{ trace_clock_global,	"global",	1 },
 	{ trace_clock_counter,	"counter",	0 },
+	{ trace_clock_jiffies,	"uptime",	1 },
 	ARCH_TRACE_CLOCKS
 };
 
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aa8f5f48dae6..26dc348332b7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -57,6 +57,16 @@ u64 notrace trace_clock(void)
 	return local_clock();
 }
 
+/*
+ * trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ */
+u64 notrace trace_clock_jiffies(void)
+{
+	u64 jiffy = jiffies - INITIAL_JIFFIES;
+
+	/* Return nsecs */
+	return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
+}
 
 /*
  * trace_clock_global(): special globally coherent trace clock
-- 
cgit v1.2.3


From 76f119179b8ce3188a8c61d2486d37810a416655 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 14 Mar 2013 17:53:25 -0400
Subject: tracing: Add "perf" trace_clock

The function trace_clock() calls "local_clock()" which is exactly
the same clock that perf uses. I'm not sure why perf doesn't call
trace_clock(), as trace_clock() doesn't have any users.

But now it does. As trace_clock() calls local_clock() like perf does,
I added the trace_clock "perf" option that uses trace_clock().

Now the ftrace buffers can use the same clock as perf uses. This
will be useful when perf starts reading the ftrace buffers, and will
be able to interleave them with the same clock data.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8eabfbb8003e..7f0e7fa6d62c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -648,6 +648,7 @@ static struct {
 	{ trace_clock_global,	"global",	1 },
 	{ trace_clock_counter,	"counter",	0 },
 	{ trace_clock_jiffies,	"uptime",	1 },
+	{ trace_clock,		"perf",		1 },
 	ARCH_TRACE_CLOCKS
 };
 
-- 
cgit v1.2.3


From 6c43e554a2a5c1f2caf1733d46719bc58de3e37b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 15 Mar 2013 11:32:53 -0400
Subject: ring-buffer: Add ring buffer startup selftest

When testing my large changes to the ftrace system, there was
a bug that looked like the ring buffer was dropping events.
I wrote up a quick integrity checker of the ring buffer to
see if it was.

Although the bug ended up being something stupid I did in ftrace,
and had nothing to do with the ring buffer, I figured if I spent
the time to write up this test, I might as well include it in the
kernel.

I cleaned it up a bit, as the original version was rather ugly.
Not saying this version is pretty, but it's a beauty queen
compared to what I original wrote.

To enable the start up test, set CONFIG_RING_BUFFER_STARTUP_TEST.

Note, it runs for 10 seconds, so it will slow your boot time
by at least 10 more seconds.

What it does is documented in both the comments and the Kconfig
help.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig       |  23 ++++
 kernel/trace/ring_buffer.c | 319 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 342 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f78eab251897..0b5ecf5517fa 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -565,6 +565,29 @@ config RING_BUFFER_BENCHMARK
 
 	  If unsure, say N.
 
+config RING_BUFFER_STARTUP_TEST
+       bool "Ring buffer startup self test"
+       depends on RING_BUFFER
+       help
+         Run a simple self test on the ring buffer on boot up. Late in the
+	 kernel boot sequence, the test will start that kicks off
+	 a thread per cpu. Each thread will write various size events
+	 into the ring buffer. Another thread is created to send IPIs
+	 to each of the threads, where the IPI handler will also write
+	 to the ring buffer, to test/stress the nesting ability.
+	 If any anomalies are discovered, a warning will be displayed
+	 and all ring buffers will be disabled.
+
+	 The test runs for 10 seconds. This will slow your boot time
+	 by at least 10 more seconds.
+
+	 At the end of the test, statics and more checks are done.
+	 It will output the stats of each per cpu buffer. What
+	 was written, the sizes, what was read, what was lost, and
+	 other similar details.
+
+	 If unsure, say N
+
 endif # FTRACE
 
 endif # TRACING_SUPPORT
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d1c85c5f5f51..e5472f7bc347 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -12,10 +12,12 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/kthread.h>	/* for self test */
 #include <linux/kmemcheck.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
+#include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/hash.h>
@@ -4634,3 +4636,320 @@ static int rb_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 #endif
+
+#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
+/*
+ * This is a basic integrity check of the ring buffer.
+ * Late in the boot cycle this test will run when configured in.
+ * It will kick off a thread per CPU that will go into a loop
+ * writing to the per cpu ring buffer various sizes of data.
+ * Some of the data will be large items, some small.
+ *
+ * Another thread is created that goes into a spin, sending out
+ * IPIs to the other CPUs to also write into the ring buffer.
+ * this is to test the nesting ability of the buffer.
+ *
+ * Basic stats are recorded and reported. If something in the
+ * ring buffer should happen that's not expected, a big warning
+ * is displayed and all ring buffers are disabled.
+ */
+static struct task_struct *rb_threads[NR_CPUS] __initdata;
+
+struct rb_test_data {
+	struct ring_buffer	*buffer;
+	unsigned long		events;
+	unsigned long		bytes_written;
+	unsigned long		bytes_alloc;
+	unsigned long		bytes_dropped;
+	unsigned long		events_nested;
+	unsigned long		bytes_written_nested;
+	unsigned long		bytes_alloc_nested;
+	unsigned long		bytes_dropped_nested;
+	int			min_size_nested;
+	int			max_size_nested;
+	int			max_size;
+	int			min_size;
+	int			cpu;
+	int			cnt;
+};
+
+static struct rb_test_data rb_data[NR_CPUS] __initdata;
+
+/* 1 meg per cpu */
+#define RB_TEST_BUFFER_SIZE	1048576
+
+static char rb_string[] __initdata =
+	"abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
+	"?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
+	"!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
+
+static bool rb_test_started __initdata;
+
+struct rb_item {
+	int size;
+	char str[];
+};
+
+static __init int rb_write_something(struct rb_test_data *data, bool nested)
+{
+	struct ring_buffer_event *event;
+	struct rb_item *item;
+	bool started;
+	int event_len;
+	int size;
+	int len;
+	int cnt;
+
+	/* Have nested writes different that what is written */
+	cnt = data->cnt + (nested ? 27 : 0);
+
+	/* Multiply cnt by ~e, to make some unique increment */
+	size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1);
+
+	len = size + sizeof(struct rb_item);
+
+	started = rb_test_started;
+	/* read rb_test_started before checking buffer enabled */
+	smp_rmb();
+
+	event = ring_buffer_lock_reserve(data->buffer, len);
+	if (!event) {
+		/* Ignore dropped events before test starts. */
+		if (started) {
+			if (nested)
+				data->bytes_dropped += len;
+			else
+				data->bytes_dropped_nested += len;
+		}
+		return len;
+	}
+
+	event_len = ring_buffer_event_length(event);
+
+	if (RB_WARN_ON(data->buffer, event_len < len))
+		goto out;
+
+	item = ring_buffer_event_data(event);
+	item->size = size;
+	memcpy(item->str, rb_string, size);
+
+	if (nested) {
+		data->bytes_alloc_nested += event_len;
+		data->bytes_written_nested += len;
+		data->events_nested++;
+		if (!data->min_size_nested || len < data->min_size_nested)
+			data->min_size_nested = len;
+		if (len > data->max_size_nested)
+			data->max_size_nested = len;
+	} else {
+		data->bytes_alloc += event_len;
+		data->bytes_written += len;
+		data->events++;
+		if (!data->min_size || len < data->min_size)
+			data->max_size = len;
+		if (len > data->max_size)
+			data->max_size = len;
+	}
+
+ out:
+	ring_buffer_unlock_commit(data->buffer, event);
+
+	return 0;
+}
+
+static __init int rb_test(void *arg)
+{
+	struct rb_test_data *data = arg;
+
+	while (!kthread_should_stop()) {
+		rb_write_something(data, false);
+		data->cnt++;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		/* Now sleep between a min of 100-300us and a max of 1ms */
+		usleep_range(((data->cnt % 3) + 1) * 100, 1000);
+	}
+
+	return 0;
+}
+
+static __init void rb_ipi(void *ignore)
+{
+	struct rb_test_data *data;
+	int cpu = smp_processor_id();
+
+	data = &rb_data[cpu];
+	rb_write_something(data, true);
+}
+
+static __init int rb_hammer_test(void *arg)
+{
+	while (!kthread_should_stop()) {
+
+		/* Send an IPI to all cpus to write data! */
+		smp_call_function(rb_ipi, NULL, 1);
+		/* No sleep, but for non preempt, let others run */
+		schedule();
+	}
+
+	return 0;
+}
+
+static __init int test_ringbuffer(void)
+{
+	struct task_struct *rb_hammer;
+	struct ring_buffer *buffer;
+	int cpu;
+	int ret = 0;
+
+	pr_info("Running ring buffer tests...\n");
+
+	buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
+	if (WARN_ON(!buffer))
+		return 0;
+
+	/* Disable buffer so that threads can't write to it yet */
+	ring_buffer_record_off(buffer);
+
+	for_each_online_cpu(cpu) {
+		rb_data[cpu].buffer = buffer;
+		rb_data[cpu].cpu = cpu;
+		rb_data[cpu].cnt = cpu;
+		rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
+						 "rbtester/%d", cpu);
+		if (WARN_ON(!rb_threads[cpu])) {
+			pr_cont("FAILED\n");
+			ret = -1;
+			goto out_free;
+		}
+
+		kthread_bind(rb_threads[cpu], cpu);
+ 		wake_up_process(rb_threads[cpu]);
+	}
+
+	/* Now create the rb hammer! */
+	rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
+	if (WARN_ON(!rb_hammer)) {
+		pr_cont("FAILED\n");
+		ret = -1;
+		goto out_free;
+	}
+
+	ring_buffer_record_on(buffer);
+	/*
+	 * Show buffer is enabled before setting rb_test_started.
+	 * Yes there's a small race window where events could be
+	 * dropped and the thread wont catch it. But when a ring
+	 * buffer gets enabled, there will always be some kind of
+	 * delay before other CPUs see it. Thus, we don't care about
+	 * those dropped events. We care about events dropped after
+	 * the threads see that the buffer is active.
+	 */
+	smp_wmb();
+	rb_test_started = true;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	/* Just run for 10 seconds */;
+	schedule_timeout(10 * HZ);
+
+	kthread_stop(rb_hammer);
+
+ out_free:
+	for_each_online_cpu(cpu) {
+		if (!rb_threads[cpu])
+			break;
+		kthread_stop(rb_threads[cpu]);
+	}
+	if (ret) {
+		ring_buffer_free(buffer);
+		return ret;
+	}
+
+	/* Report! */
+	pr_info("finished\n");
+	for_each_online_cpu(cpu) {
+		struct ring_buffer_event *event;
+		struct rb_test_data *data = &rb_data[cpu];
+		struct rb_item *item;
+		unsigned long total_events;
+		unsigned long total_dropped;
+		unsigned long total_written;
+		unsigned long total_alloc;
+		unsigned long total_read = 0;
+		unsigned long total_size = 0;
+		unsigned long total_len = 0;
+		unsigned long total_lost = 0;
+		unsigned long lost;
+		int big_event_size;
+		int small_event_size;
+
+		ret = -1;
+
+		total_events = data->events + data->events_nested;
+		total_written = data->bytes_written + data->bytes_written_nested;
+		total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
+		total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
+
+		big_event_size = data->max_size + data->max_size_nested;
+		small_event_size = data->min_size + data->min_size_nested;
+
+		pr_info("CPU %d:\n", cpu);
+		pr_info("              events:    %ld\n", total_events);
+		pr_info("       dropped bytes:    %ld\n", total_dropped);
+		pr_info("       alloced bytes:    %ld\n", total_alloc);
+		pr_info("       written bytes:    %ld\n", total_written);
+		pr_info("       biggest event:    %d\n", big_event_size);
+		pr_info("      smallest event:    %d\n", small_event_size);
+
+		if (RB_WARN_ON(buffer, total_dropped))
+			break;
+
+		ret = 0;
+
+		while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
+			total_lost += lost;
+			item = ring_buffer_event_data(event);
+			total_len += ring_buffer_event_length(event);
+			total_size += item->size + sizeof(struct rb_item);
+			if (memcmp(&item->str[0], rb_string, item->size) != 0) {
+				pr_info("FAILED!\n");
+				pr_info("buffer had: %.*s\n", item->size, item->str);
+				pr_info("expected:   %.*s\n", item->size, rb_string);
+				RB_WARN_ON(buffer, 1);
+				ret = -1;
+				break;
+			}
+			total_read++;
+		}
+		if (ret)
+			break;
+
+		ret = -1;
+
+		pr_info("         read events:   %ld\n", total_read);
+		pr_info("         lost events:   %ld\n", total_lost);
+		pr_info("        total events:   %ld\n", total_lost + total_read);
+		pr_info("  recorded len bytes:   %ld\n", total_len);
+		pr_info(" recorded size bytes:   %ld\n", total_size);
+		if (total_lost)
+			pr_info(" With dropped events, record len and size may not match\n"
+				" alloced and written from above\n");
+		if (!total_lost) {
+			if (RB_WARN_ON(buffer, total_len != total_alloc ||
+				       total_size != total_written))
+				break;
+		}
+		if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
+			break;
+
+		ret = 0;
+	}
+	if (!ret)
+		pr_info("Ring buffer PASSED!\n");
+
+	ring_buffer_free(buffer);
+	return 0;
+}
+
+late_initcall(test_ringbuffer);
+#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
-- 
cgit v1.2.3


From 687c878afb526a0c3117dbc408ca76ad80d689f7 Mon Sep 17 00:00:00 2001
From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com>
Date: Mon, 11 Mar 2013 15:13:29 +0800
Subject: tracing: Use pr_warn_once instead of open coded implementation

Use pr_warn_once, instead of making an open coded implementation.

Link: http://lkml.kernel.org/r/513D8419.20400@huawei.com

Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7f0e7fa6d62c..bba1ba958ee8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5205,8 +5205,6 @@ static inline int register_snapshot_cmd(void) { return 0; }
 
 struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
 {
-	static int once;
-
 	if (tr->dir)
 		return tr->dir;
 
@@ -5216,11 +5214,8 @@ struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
 	if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
 		tr->dir = debugfs_create_dir("tracing", NULL);
 
-	if (!tr->dir && !once) {
-		once = 1;
-		pr_warning("Could not create debugfs directory 'tracing'\n");
-		return NULL;
-	}
+	if (!tr->dir)
+		pr_warn_once("Could not create debugfs directory 'tracing'\n");
 
 	return tr->dir;
 }
-- 
cgit v1.2.3


From bd6df18716fa45bc4aa9587aca033de909e5382b Mon Sep 17 00:00:00 2001
From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com>
Date: Mon, 11 Mar 2013 15:13:37 +0800
Subject: tracing: Use TRACE_MAX_PRINT instead of constant

TRACE_MAX_PRINT macro is defined, but is not used.

Link: http://lkml.kernel.org/r/513D8421.4070404@huawei.com

Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bba1ba958ee8..848625674752 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5978,8 +5978,8 @@ void
 trace_printk_seq(struct trace_seq *s)
 {
 	/* Probably should print a warning here. */
-	if (s->len >= 1000)
-		s->len = 1000;
+	if (s->len >= TRACE_MAX_PRINT)
+		s->len = TRACE_MAX_PRINT;
 
 	/* should be zero ended, but we are paranoid. */
 	s->buffer[s->len] = 0;
-- 
cgit v1.2.3


From b3a8c6fd7bb61c910bd4f80ae1d75056e8f98c19 Mon Sep 17 00:00:00 2001
From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com>
Date: Mon, 11 Mar 2013 15:13:42 +0800
Subject: tracing: Move find_event_field() into trace_events.c

By moving find_event_field() and trace_find_field() into trace_events.c,
the ftrace_common_fields list and trace_get_fields() can become local to
the trace_events.c file.

find_event_field() is renamed to trace_find_event_field() to conform to
the tracing global function names.

Link: http://lkml.kernel.org/r/513D8426.9070109@huawei.com

Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
[ rostedt: Modified trace_find_field() to trace_find_event_field() ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h               |  6 ++----
 kernel/trace/trace_events.c        | 31 +++++++++++++++++++++++++++++--
 kernel/trace/trace_events_filter.c | 29 +----------------------------
 3 files changed, 32 insertions(+), 34 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5cc52361bc9f..9e014582e763 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -995,8 +995,6 @@ struct filter_pred {
 	unsigned short		right;
 };
 
-extern struct list_head ftrace_common_fields;
-
 extern enum regex_type
 filter_parse_regex(char *buff, int len, char **search, int *not);
 extern void print_event_filter(struct ftrace_event_call *call,
@@ -1009,8 +1007,8 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
 					 struct trace_seq *s);
 extern int filter_assign_type(const char *type);
 
-struct list_head *
-trace_get_fields(struct ftrace_event_call *event_call);
+struct ftrace_event_field *
+trace_find_event_field(struct ftrace_event_call *call, char *name);
 
 static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c636523b1a59..ba523d7beea2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -34,7 +34,7 @@ char event_storage[EVENT_STORAGE_SIZE];
 EXPORT_SYMBOL_GPL(event_storage);
 
 LIST_HEAD(ftrace_events);
-LIST_HEAD(ftrace_common_fields);
+static LIST_HEAD(ftrace_common_fields);
 
 #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
 
@@ -54,7 +54,7 @@ static struct kmem_cache *file_cachep;
 #define while_for_each_event_file()		\
 	}
 
-struct list_head *
+static struct list_head *
 trace_get_fields(struct ftrace_event_call *event_call)
 {
 	if (!event_call->class->get_fields)
@@ -62,6 +62,33 @@ trace_get_fields(struct ftrace_event_call *event_call)
 	return event_call->class->get_fields(event_call);
 }
 
+static struct ftrace_event_field *
+__find_event_field(struct list_head *head, char *name)
+{
+	struct ftrace_event_field *field;
+
+	list_for_each_entry(field, head, link) {
+		if (!strcmp(field->name, name))
+			return field;
+	}
+
+	return NULL;
+}
+
+struct ftrace_event_field *
+trace_find_event_field(struct ftrace_event_call *call, char *name)
+{
+	struct ftrace_event_field *field;
+	struct list_head *head;
+
+	field = __find_event_field(&ftrace_common_fields, name);
+	if (field)
+		return field;
+
+	head = trace_get_fields(call);
+	return __find_event_field(head, name);
+}
+
 static int __trace_define_field(struct list_head *head, const char *type,
 				const char *name, int offset, int size,
 				int is_signed, int filter_type)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2a22a177ab44..a6361178de5a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,
 	mutex_unlock(&event_mutex);
 }
 
-static struct ftrace_event_field *
-__find_event_field(struct list_head *head, char *name)
-{
-	struct ftrace_event_field *field;
-
-	list_for_each_entry(field, head, link) {
-		if (!strcmp(field->name, name))
-			return field;
-	}
-
-	return NULL;
-}
-
-static struct ftrace_event_field *
-find_event_field(struct ftrace_event_call *call, char *name)
-{
-	struct ftrace_event_field *field;
-	struct list_head *head;
-
-	field = __find_event_field(&ftrace_common_fields, name);
-	if (field)
-		return field;
-
-	head = trace_get_fields(call);
-	return __find_event_field(head, name);
-}
-
 static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
 {
 	stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
@@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
 		return NULL;
 	}
 
-	field = find_event_field(call, operand1);
+	field = trace_find_event_field(call, operand1);
 	if (!field) {
 		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
 		return NULL;
-- 
cgit v1.2.3


From ad7067cebf3253412a7c0a169a9dd056b11e69ac Mon Sep 17 00:00:00 2001
From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com>
Date: Mon, 11 Mar 2013 15:13:46 +0800
Subject: tracing: Convert trace_destroy_fields() to static

trace_destroy_fields() is not used outside of the file. It can be
a static function.

Link: http://lkml.kernel.org/r/513D842A.2000907@huawei.com

Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ba523d7beea2..a71cdc3c5df9 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -158,7 +158,7 @@ static int trace_define_common_fields(void)
 	return ret;
 }
 
-void trace_destroy_fields(struct ftrace_event_call *call)
+static void trace_destroy_fields(struct ftrace_event_call *call)
 {
 	struct ftrace_event_field *field, *next;
 	struct list_head *head;
-- 
cgit v1.2.3


From 36a78e9e8792bfb052643eaf9374f837e634982c Mon Sep 17 00:00:00 2001
From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com>
Date: Mon, 11 Mar 2013 15:13:51 +0800
Subject: tracing: Fix comment about prefix in arch_syscall_match_sym_name()

ppc64 has its own syscall prefix like ".SyS" or ".sys". Make the
comment in arch_syscall_match_sym_name() more understandable.

Link: http://lkml.kernel.org/r/513D842F.40205@huawei.com

Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 68f3f344be65..8f2ac73c7a5f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -37,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
 	/*
 	 * Only compare after the "sys" prefix. Archs that use
 	 * syscall wrappers may have syscalls symbols aliases prefixed
-	 * with "SyS" instead of "sys", leading to an unwanted
+	 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
 	 * mismatch.
 	 */
 	return !strcmp(sym + 3, name + 3);
-- 
cgit v1.2.3


From 52f6ad6dc3f4c6de598fe7cc9b629888d624aa52 Mon Sep 17 00:00:00 2001
From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com>
Date: Mon, 11 Mar 2013 15:14:03 +0800
Subject: tracing: Rename trace_event_mutex to trace_event_sem

trace_event_mutex is an rw semaphore now, not a mutex, change the name.

Link: http://lkml.kernel.org/r/513D843B.40109@huawei.com

Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
[ Forward ported to my new code ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 22 +++++++++++-----------
 kernel/trace/trace_output.c | 16 ++++++++--------
 kernel/trace/trace_output.h |  2 +-
 3 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a71cdc3c5df9..53582e982e51 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1584,7 +1584,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
 }
 
 /*
- * Must be called under locking both of event_mutex and trace_event_mutex.
+ * Must be called under locking both of event_mutex and trace_event_sem.
  */
 static void __trace_remove_event_call(struct ftrace_event_call *call)
 {
@@ -1597,9 +1597,9 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
 void trace_remove_event_call(struct ftrace_event_call *call)
 {
 	mutex_lock(&event_mutex);
-	down_write(&trace_event_mutex);
+	down_write(&trace_event_sem);
 	__trace_remove_event_call(call);
-	up_write(&trace_event_mutex);
+	up_write(&trace_event_sem);
 	mutex_unlock(&event_mutex);
 }
 
@@ -1707,7 +1707,7 @@ static void trace_module_remove_events(struct module *mod)
 	struct ftrace_event_call *call, *p;
 	bool clear_trace = false;
 
-	down_write(&trace_event_mutex);
+	down_write(&trace_event_sem);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
@@ -1725,7 +1725,7 @@ static void trace_module_remove_events(struct module *mod)
 		list_del(&file_ops->list);
 		kfree(file_ops);
 	}
-	up_write(&trace_event_mutex);
+	up_write(&trace_event_sem);
 
 	/*
 	 * It is safest to reset the ring buffer if the module being unloaded
@@ -2262,9 +2262,9 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 	if (ret)
 		goto out_unlock;
 
-	down_write(&trace_event_mutex);
+	down_write(&trace_event_sem);
 	__trace_add_event_dirs(tr);
-	up_write(&trace_event_mutex);
+	up_write(&trace_event_sem);
 
  out_unlock:
 	mutex_unlock(&event_mutex);
@@ -2287,9 +2287,9 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
 	if (ret)
 		goto out_unlock;
 
-	down_write(&trace_event_mutex);
+	down_write(&trace_event_sem);
 	__trace_early_add_event_dirs(tr);
-	up_write(&trace_event_mutex);
+	up_write(&trace_event_sem);
 
  out_unlock:
 	mutex_unlock(&event_mutex);
@@ -2304,10 +2304,10 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	mutex_lock(&event_mutex);
 
-	down_write(&trace_event_mutex);
+	down_write(&trace_event_sem);
 	__trace_remove_event_dirs(tr);
 	debugfs_remove_recursive(tr->event_dir);
-	up_write(&trace_event_mutex);
+	up_write(&trace_event_sem);
 
 	tr->event_dir = NULL;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 19f48e7edc39..f475b2a7ac88 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-DECLARE_RWSEM(trace_event_mutex);
+DECLARE_RWSEM(trace_event_sem);
 
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
@@ -826,12 +826,12 @@ static int trace_search_list(struct list_head **list)
 
 void trace_event_read_lock(void)
 {
-	down_read(&trace_event_mutex);
+	down_read(&trace_event_sem);
 }
 
 void trace_event_read_unlock(void)
 {
-	up_read(&trace_event_mutex);
+	up_read(&trace_event_sem);
 }
 
 /**
@@ -854,7 +854,7 @@ int register_ftrace_event(struct trace_event *event)
 	unsigned key;
 	int ret = 0;
 
-	down_write(&trace_event_mutex);
+	down_write(&trace_event_sem);
 
 	if (WARN_ON(!event))
 		goto out;
@@ -909,14 +909,14 @@ int register_ftrace_event(struct trace_event *event)
 
 	ret = event->type;
  out:
-	up_write(&trace_event_mutex);
+	up_write(&trace_event_sem);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
 
 /*
- * Used by module code with the trace_event_mutex held for write.
+ * Used by module code with the trace_event_sem held for write.
  */
 int __unregister_ftrace_event(struct trace_event *event)
 {
@@ -931,9 +931,9 @@ int __unregister_ftrace_event(struct trace_event *event)
  */
 int unregister_ftrace_event(struct trace_event *event)
 {
-	down_write(&trace_event_mutex);
+	down_write(&trace_event_sem);
 	__unregister_ftrace_event(event);
-	up_write(&trace_event_mutex);
+	up_write(&trace_event_sem);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index af77870de278..127a9d8c8357 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -33,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
 
 /* used by module unregistering */
 extern int __unregister_ftrace_event(struct trace_event *event);
-extern struct rw_semaphore trace_event_mutex;
+extern struct rw_semaphore trace_event_sem;
 
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-- 
cgit v1.2.3


From 7fe70b579c9e3daba71635e31b6189394e7b79d3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 15 Mar 2013 13:10:35 -0400
Subject: tracing: Fix ftrace_dump()

ftrace_dump() had a lot of issues. What ftrace_dump() does, is when
ftrace_dump_on_oops is set (via a kernel parameter or sysctl), it
will dump out the ftrace buffers to the console when either a oops,
panic, or a sysrq-z occurs.

This was written a long time ago when ftrace was fragile to recursion.
But it wasn't written well even for that.

There's a possible deadlock that can occur if a ftrace_dump() is happening
and an NMI triggers another dump. This is because it grabs a lock
before checking if the dump ran.

It also totally disables ftrace, and tracing for no good reasons.

As the ring_buffer now checks if it is read via a oops or NMI, where
there's a chance that the buffer gets corrupted, it will disable
itself. No need to have ftrace_dump() do the same.

ftrace_dump() is now cleaned up where it uses an atomic counter to
make sure only one dump happens at a time. A simple atomic_inc_return()
is enough that is needed for both other CPUs and NMIs. No need for
a spinlock, as if one CPU is running the dump, no other CPU needs
to do it too.

The tracing_on variable is turned off and not turned on. The original
code did this, but it wasn't pretty. By just disabling this variable
we get the result of not seeing traces that happen between crashes.

For sysrq-z, it doesn't get turned on, but the user can always write
a '1' to the tracing_on file. If they are using sysrq-z, then they should
know about tracing_on.

The new code is much easier to read and less error prone. No more
deadlock possibility when an NMI triggers here.

Reported-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
Cc: stable@vger.kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c          | 62 ++++++++++++++++++-------------------------
 kernel/trace/trace_selftest.c |  9 ++++---
 2 files changed, 31 insertions(+), 40 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 848625674752..3dc7999594e1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5997,36 +5997,32 @@ void trace_init_global_iter(struct trace_iterator *iter)
 	iter->trace_buffer = &global_trace.trace_buffer;
 }
 
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 {
-	static arch_spinlock_t ftrace_dump_lock =
-		(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
+	static atomic_t dump_running;
 	unsigned int old_userobj;
-	static int dump_ran;
 	unsigned long flags;
 	int cnt = 0, cpu;
 
-	/* only one dump */
-	local_irq_save(flags);
-	arch_spin_lock(&ftrace_dump_lock);
-	if (dump_ran)
-		goto out;
-
-	dump_ran = 1;
+	/* Only allow one dump user at a time. */
+	if (atomic_inc_return(&dump_running) != 1) {
+		atomic_dec(&dump_running);
+		return;
+	}
 
+	/*
+	 * Always turn off tracing when we dump.
+	 * We don't need to show trace output of what happens
+	 * between multiple crashes.
+	 *
+	 * If the user does a sysrq-z, then they can re-enable
+	 * tracing with echo 1 > tracing_on.
+	 */
 	tracing_off();
 
-	/* Did function tracer already get disabled? */
-	if (ftrace_is_dead()) {
-		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
-		printk("#          MAY BE MISSING FUNCTION EVENTS\n");
-	}
-
-	if (disable_tracing)
-		ftrace_kill();
+	local_irq_save(flags);
 
 	/* Simulate the iterator */
 	trace_init_global_iter(&iter);
@@ -6056,6 +6052,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 
 	printk(KERN_TRACE "Dumping ftrace buffer:\n");
 
+	/* Did function tracer already get disabled? */
+	if (ftrace_is_dead()) {
+		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+		printk("#          MAY BE MISSING FUNCTION EVENTS\n");
+	}
+
 	/*
 	 * We need to stop all tracing on all CPUS to read the
 	 * the next buffer. This is a bit expensive, but is
@@ -6095,26 +6097,14 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 		printk(KERN_TRACE "---------------------------------\n");
 
  out_enable:
-	/* Re-enable tracing if requested */
-	if (!disable_tracing) {
-		trace_flags |= old_userobj;
+	trace_flags |= old_userobj;
 
-		for_each_tracing_cpu(cpu) {
-			atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
-		}
-		tracing_on();
+	for_each_tracing_cpu(cpu) {
+		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
 	}
-
- out:
-	arch_spin_unlock(&ftrace_dump_lock);
+ 	atomic_dec(&dump_running);
 	local_irq_restore(flags);
 }
-
-/* By default: disable tracing after the dump */
-void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
-{
-	__ftrace_dump(true, oops_dump_mode);
-}
 EXPORT_SYMBOL_GPL(ftrace_dump);
 
 __init static int tracer_alloc_buffers(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 8672c40cb153..55e2cf66967b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 /* Maximum number of functions to trace before diagnosing a hang */
 #define GRAPH_MAX_FUNC_TEST	100000000
 
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
 static unsigned int graph_hang_thresh;
 
 /* Wrap the real function entry probe to avoid possible hanging */
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
 	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
 		ftrace_graph_stop();
 		printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
-		if (ftrace_dump_on_oops)
-			__ftrace_dump(false, DUMP_ALL);
+		if (ftrace_dump_on_oops) {
+			ftrace_dump(DUMP_ALL);
+			/* ftrace_dump() disables tracing */
+			tracing_on();
+		}
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 22f45649ce08642ad7df238d5c25fa5c86bfdd31 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 15 Mar 2013 17:23:20 -0400
Subject: tracing: Update debugfs README file

Update the README file in debugfs/tracing to something more useful.
What's currently in the file is very old and what it shows doesn't
have much use. Heck, it tells you how to mount debugfs! But to read
this file you would have already needed to mount it.

Replace the file with current up-to-date information. It's rather
limited, but what do you expect from a pseudo README file.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 92 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 78 insertions(+), 14 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3dc7999594e1..829b2bee24e8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3300,20 +3300,84 @@ static const struct file_operations tracing_iter_fops = {
 
 static const char readme_msg[] =
 	"tracing mini-HOWTO:\n\n"
-	"# mount -t debugfs nodev /sys/kernel/debug\n\n"
-	"# cat /sys/kernel/debug/tracing/available_tracers\n"
-	"wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n"
-	"# cat /sys/kernel/debug/tracing/current_tracer\n"
-	"nop\n"
-	"# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n"
-	"# cat /sys/kernel/debug/tracing/current_tracer\n"
-	"wakeup\n"
-	"# cat /sys/kernel/debug/tracing/trace_options\n"
-	"noprint-parent nosym-offset nosym-addr noverbose\n"
-	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
-	"# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
-	"# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
-	"# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
+	"# echo 0 > tracing_on : quick way to disable tracing\n"
+	"# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
+	" Important files:\n"
+	"  trace\t\t\t- The static contents of the buffer\n"
+	"\t\t\t  To clear the buffer write into this file: echo > trace\n"
+	"  trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
+	"  current_tracer\t- function and latency tracers\n"
+	"  available_tracers\t- list of configured tracers for current_tracer\n"
+	"  buffer_size_kb\t- view and modify size of per cpu buffer\n"
+	"  buffer_total_size_kb  - view total size of all cpu buffers\n\n"
+	"  trace_clock\t\t-change the clock used to order events\n"
+	"       local:   Per cpu clock but may not be synced across CPUs\n"
+	"      global:   Synced across CPUs but slows tracing down.\n"
+	"     counter:   Not a clock, but just an increment\n"
+	"      uptime:   Jiffy counter from time of boot\n"
+	"        perf:   Same clock that perf events use\n"
+#ifdef CONFIG_X86_64
+	"     x86-tsc:   TSC cycle counter\n"
+#endif
+	"\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
+	"  tracing_cpumask\t- Limit which CPUs to trace\n"
+	"  instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
+	"\t\t\t  Remove sub-buffer with rmdir\n"
+	"  trace_options\t\t- Set format or modify how tracing happens\n"
+	"\t\t\t  Disable an option by adding a suffix 'no' to the option name\n"
+#ifdef CONFIG_DYNAMIC_FTRACE
+	"\n  available_filter_functions - list of functions that can be filtered on\n"
+	"  set_ftrace_filter\t- echo function name in here to only trace these functions\n"
+	"            accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+	"            modules: Can select a group via module\n"
+	"             Format: :mod:<module-name>\n"
+	"             example: echo :mod:ext3 > set_ftrace_filter\n"
+	"            triggers: a command to perform when function is hit\n"
+	"              Format: <function>:<trigger>[:count]\n"
+	"             trigger: traceon, traceoff\n"
+	"                      enable_event:<system>:<event>\n"
+	"                      disable_event:<system>:<event>\n"
+#ifdef CONFIG_STACKTRACE
+	"                      stacktrace\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+	"                      snapshot\n"
+#endif
+	"             example: echo do_fault:traceoff > set_ftrace_filter\n"
+	"                      echo do_trap:traceoff:3 > set_ftrace_filter\n"
+	"             The first one will disable tracing every time do_fault is hit\n"
+	"             The second will disable tracing at most 3 times when do_trap is hit\n"
+	"               The first time do trap is hit and it disables tracing, the counter\n"
+	"               will decrement to 2. If tracing is already disabled, the counter\n"
+	"               will not decrement. It only decrements when the trigger did work\n"
+	"             To remove trigger without count:\n"
+	"               echo '!<function>:<trigger> > set_ftrace_filter\n"
+	"             To remove trigger with a count:\n"
+	"               echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
+	"  set_ftrace_notrace\t- echo function name in here to never trace.\n"
+	"            accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+	"            modules: Can select a group via module command :mod:\n"
+	"            Does not accept triggers\n"
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#ifdef CONFIG_FUNCTION_TRACER
+	"  set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n"
+#endif
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	"  set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
+	"  max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+	"\n  snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
+	"\t\t\t  Read the contents for more information\n"
+#endif
+#ifdef CONFIG_STACKTRACE
+	"  stack_trace\t\t- Shows the max stack trace when active\n"
+	"  stack_max_size\t- Shows current max stack size that was traced\n"
+	"\t\t\t  Write into this file to reset the max size (trigger a new trace)\n"
+#ifdef CONFIG_DYNAMIC_FTRACE
+	"  stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
+#endif
+#endif /* CONFIG_STACKTRACE */
 ;
 
 static ssize_t
-- 
cgit v1.2.3


From 9ca8f8e51003a51382cb4216e9029ee33ccf6c46 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <koverstreet@google.com>
Date: Fri, 13 Apr 2012 21:56:43 -0400
Subject: Export blk_fill_rwbs()

Exported so it can be used by bcache's tracepoints

Signed-off-by: Kent Overstreet <koverstreet@google.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@redhat.com>
---
 kernel/trace/blktrace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272eec..20d8a9825f96 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1828,6 +1828,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 
 	rwbs[i] = '\0';
 }
+EXPORT_SYMBOL_GPL(blk_fill_rwbs);
 
 #endif /* CONFIG_EVENT_TRACING */
 
-- 
cgit v1.2.3


From 67012ab1d2ce871afea4ee55408f233f97d09d07 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 12:06:44 +0800
Subject: perf: Fix strncpy() use, use strlcpy() instead of strncpy()

For NUL terminated string we always need to set '\0' at the end.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: rostedt@goodmis.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/51624254.30301@asianux.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f1dade56981..3f5046a4d81b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -132,7 +132,7 @@ static char *default_bootup_tracer;
 
 static int __init set_cmdline_ftrace(char *str)
 {
-	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
 	/* We are using ftrace early, expand it */
 	ring_buffer_expanded = 1;
@@ -162,7 +162,7 @@ static char *trace_boot_options __initdata;
 
 static int __init set_trace_boot_options(char *str)
 {
-	strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
 	trace_boot_options = trace_boot_options_buf;
 	return 0;
 }
-- 
cgit v1.2.3


From 75761cc15877c155b3849b4e0e0cb3f897faf471 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 12:12:39 +0800
Subject: ftrace: Fix strncpy() use, use strlcpy() instead of strncpy()

For NUL terminated string we always need to set '\0' at the end.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: rostedt@goodmis.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/516243B7.9020405@asianux.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6893d5a2bf08..db143742704b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3441,14 +3441,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
 
 static int __init set_ftrace_notrace(char *str)
 {
-	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_notrace=", set_ftrace_notrace);
 
 static int __init set_ftrace_filter(char *str)
 {
-	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_filter=", set_ftrace_filter);
-- 
cgit v1.2.3


From 2930e04d00e113ae24bb2b7c2b58de7b648a62c7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 26 Mar 2013 17:33:00 -0400
Subject: tracing: Fix race with update_max_tr_single and changing tracers

The commit 34600f0e9 "tracing: Fix race with max_tr and changing tracers"
fixed the updating of the main buffers with the race of changing
tracers, but left out the fix to the updating of just a per cpu buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f1dade56981..7ba7fc76f9eb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -744,8 +744,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
+	if (!current_trace->allocated_snapshot) {
+		/* Only the nop tracer should hit this when disabling */
+		WARN_ON_ONCE(current_trace != &nop_trace);
 		return;
+	}
 
 	arch_spin_lock(&ftrace_max_lock);
 
-- 
cgit v1.2.3


From 5000c418840b309251c5887f0b56503aae30f84c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 26 Mar 2013 17:53:03 +0100
Subject: ftrace: Consistently restore trace function on sysctl enabling

If we reenable ftrace via syctl, we currently set ftrace_trace_function
based on the previous simplistic algorithm. This is inconsistent with
what update_ftrace_function does. So better call that helper instead.

Link: http://lkml.kernel.org/r/5151D26F.1070702@siemens.com

Cc: stable@vger.kernel.org
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6893d5a2bf08..cc4943c7ce6d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4555,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 		ftrace_startup_sysctl();
 
 		/* we are starting ftrace again */
-		if (ftrace_ops_list != &ftrace_list_end) {
-			if (ftrace_ops_list->next == &ftrace_list_end)
-				ftrace_trace_function = ftrace_ops_list->func;
-			else
-				ftrace_trace_function = ftrace_ops_list_func;
-		}
+		if (ftrace_ops_list != &ftrace_list_end)
+			update_ftrace_function();
 
 	} else {
 		/* stopping ftrace calls (just send to ftrace_stub) */
-- 
cgit v1.2.3


From 395b97a3aeff0b8d949ee3e67bf8c11c5ffd6861 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 27 Mar 2013 09:31:28 -0400
Subject: ftrace: Do not call stub functions in control loop

The function tracing control loop used by perf spits out a warning
if the called function is not a control function. This is because
the control function references a per cpu allocated data structure
on struct ftrace_ops that is not allocated for other types of
functions.

commit 0a016409e42 "ftrace: Optimize the function tracer list loop"

Had an optimization done to all function tracing loops to optimize
for a single registered ops. Unfortunately, this allows for a slight
race when tracing starts or ends, where the stub function might be
called after the current registered ops is removed. In this case we
get the following dump:

root# perf stat -e ftrace:function sleep 1
[   74.339105] WARNING: at include/linux/ftrace.h:209 ftrace_ops_control_func+0xde/0xf0()
[   74.349522] Hardware name: PRIMERGY RX200 S6
[   74.357149] Modules linked in: sg igb iTCO_wdt ptp pps_core iTCO_vendor_support i7core_edac dca lpc_ich i2c_i801 coretemp edac_core crc32c_intel mfd_core ghash_clmulni_intel dm_multipath acpi_power_meter pcspk
r microcode vhost_net tun macvtap macvlan nfsd kvm_intel kvm auth_rpcgss nfs_acl lockd sunrpc uinput xfs libcrc32c sd_mod crc_t10dif sr_mod cdrom mgag200 i2c_algo_bit drm_kms_helper ttm qla2xxx mptsas ahci drm li
bahci scsi_transport_sas mptscsih libata scsi_transport_fc i2c_core mptbase scsi_tgt dm_mirror dm_region_hash dm_log dm_mod
[   74.446233] Pid: 1377, comm: perf Tainted: G        W    3.9.0-rc1 #1
[   74.453458] Call Trace:
[   74.456233]  [<ffffffff81062e3f>] warn_slowpath_common+0x7f/0xc0
[   74.462997]  [<ffffffff810fbc60>] ? rcu_note_context_switch+0xa0/0xa0
[   74.470272]  [<ffffffff811041a2>] ? __unregister_ftrace_function+0xa2/0x1a0
[   74.478117]  [<ffffffff81062e9a>] warn_slowpath_null+0x1a/0x20
[   74.484681]  [<ffffffff81102ede>] ftrace_ops_control_func+0xde/0xf0
[   74.491760]  [<ffffffff8162f400>] ftrace_call+0x5/0x2f
[   74.497511]  [<ffffffff8162f400>] ? ftrace_call+0x5/0x2f
[   74.503486]  [<ffffffff8162f400>] ? ftrace_call+0x5/0x2f
[   74.509500]  [<ffffffff810fbc65>] ? synchronize_sched+0x5/0x50
[   74.516088]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.522268]  [<ffffffff810fbc65>] ? synchronize_sched+0x5/0x50
[   74.528837]  [<ffffffff811041a2>] ? __unregister_ftrace_function+0xa2/0x1a0
[   74.536696]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.542878]  [<ffffffff8162402d>] ? mutex_lock+0x1d/0x50
[   74.548869]  [<ffffffff81105c67>] unregister_ftrace_function+0x27/0x50
[   74.556243]  [<ffffffff8111eadf>] perf_ftrace_event_register+0x9f/0x140
[   74.563709]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.569887]  [<ffffffff8162402d>] ? mutex_lock+0x1d/0x50
[   74.575898]  [<ffffffff8111e94e>] perf_trace_destroy+0x2e/0x50
[   74.582505]  [<ffffffff81127ba9>] tp_perf_event_destroy+0x9/0x10
[   74.589298]  [<ffffffff811295d0>] free_event+0x70/0x1a0
[   74.595208]  [<ffffffff8112a579>] perf_event_release_kernel+0x69/0xa0
[   74.602460]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.608667]  [<ffffffff8112a640>] put_event+0x90/0xc0
[   74.614373]  [<ffffffff8112a740>] perf_release+0x10/0x20
[   74.620367]  [<ffffffff811a3044>] __fput+0xf4/0x280
[   74.625894]  [<ffffffff811a31de>] ____fput+0xe/0x10
[   74.631387]  [<ffffffff81083697>] task_work_run+0xa7/0xe0
[   74.637452]  [<ffffffff81014981>] do_notify_resume+0x71/0xb0
[   74.643843]  [<ffffffff8162fa92>] int_signal+0x12/0x17

To fix this a new ftrace_ops flag is added that denotes the ftrace_list_end
ftrace_ops stub as just that, a stub. This flag is now checked in the
control loop and the function is not called if the flag is set.

Thanks to Jovi for not just reporting the bug, but also pointing out
where the bug was in the code.

Link: http://lkml.kernel.org/r/514A8855.7090402@redhat.com
Link: http://lkml.kernel.org/r/1364377499-1900-15-git-send-email-jovi.zhangwei@huawei.com

Tested-by: WANG Chao <chaowang@redhat.com>
Reported-by: WANG Chao <chaowang@redhat.com>
Reported-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cc4943c7ce6d..7e897106b7e0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -66,7 +66,7 @@
 
 static struct ftrace_ops ftrace_list_end __read_mostly = {
 	.func		= ftrace_stub,
-	.flags		= FTRACE_OPS_FL_RECURSION_SAFE,
+	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
 };
 
 /* ftrace_enabled is a method to turn ftrace on or off */
@@ -4131,7 +4131,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
 	preempt_disable_notrace();
 	trace_recursion_set(TRACE_CONTROL_BIT);
 	do_for_each_ftrace_op(op, ftrace_control_list) {
-		if (!ftrace_function_local_disabled(op) &&
+		if (!(op->flags & FTRACE_OPS_FL_STUB) &&
+		    !ftrace_function_local_disabled(op) &&
 		    ftrace_ops_test(op, ip))
 			op->func(ip, parent_ip, op, regs);
 	} while_for_each_ftrace_op(op);
-- 
cgit v1.2.3


From 9607a869ee59594f3f7b9f3ac43a11d92bf3f960 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 12:06:44 +0800
Subject: kernel: tracing: Use strlcpy instead of strncpy

Use strlcpy() instead of strncpy() as it will always add a '\0'
to the end of the string even if the buffer is smaller than what
is being copied.

Link: http://lkml.kernel.org/r/51624254.30301@asianux.com

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 4 ++--
 kernel/trace/trace.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 25770824598f..548a1f7ea2c1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3496,14 +3496,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
 
 static int __init set_ftrace_notrace(char *str)
 {
-	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_notrace=", set_ftrace_notrace);
 
 static int __init set_ftrace_filter(char *str)
 {
-	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_filter=", set_ftrace_filter);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 829b2bee24e8..07860b995752 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,7 +125,7 @@ static bool allocate_snapshot;
 
 static int __init set_cmdline_ftrace(char *str)
 {
-	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
 	/* We are using ftrace early, expand it */
 	ring_buffer_expanded = true;
@@ -164,7 +164,7 @@ static char *trace_boot_options __initdata;
 
 static int __init set_trace_boot_options(char *str)
 {
-	strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
 	trace_boot_options = trace_boot_options_buf;
 	return 0;
 }
-- 
cgit v1.2.3


From 83e03b3fe4daffdebbb42151d5410d730ae50bd1 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Mon, 1 Apr 2013 21:46:23 +0900
Subject: tracing: Fix double free when function profile init failed

On the failure path, stat->start and stat->pages will refer same page.
So it'll attempt to free the same page again and get kernel panic.

Link: http://lkml.kernel.org/r/1364820385-32027-1-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7e897106b7e0..926ebfb74936 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 		free_page(tmp);
 	}
 
-	free_page((unsigned long)stat->pages);
 	stat->pages = NULL;
 	stat->start = NULL;
 
-- 
cgit v1.2.3


From 39e30cd1537937d3c00ef87e865324e981434e5b Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Mon, 1 Apr 2013 21:46:24 +0900
Subject: tracing: Fix off-by-one on allocating stat->pages

The first page was allocated separately, so no need to start from 0.

Link: http://lkml.kernel.org/r/1364820385-32027-2-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 548a1f7ea2c1..c9f31491009f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 
 	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
 
-	for (i = 0; i < pages; i++) {
+	for (i = 1; i < pages; i++) {
 		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
 		if (!pg->next)
 			goto out_free;
-- 
cgit v1.2.3


From 6a76f8c0ab19f215af2a3442870eeb5f0e81998d Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Thu, 11 Apr 2013 15:55:01 +0900
Subject: tracing: Fix possible NULL pointer dereferences

Currently set_ftrace_pid and set_graph_function files use seq_lseek
for their fops.  However seq_open() is called only for FMODE_READ in
the fops->open() so that if an user tries to seek one of those file
when she open it for writing, it sees NULL seq_file and then panic.

It can be easily reproduced with following command:

  $ cd /sys/kernel/debug/tracing
  $ echo 1234 | sudo tee -a set_ftrace_pid

In this example, GNU coreutils' tee opens the file with fopen(, "a")
and then the fopen() internally calls lseek().

Link: http://lkml.kernel.org/r/1365663302-2170-1-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c      | 10 +++++-----
 kernel/trace/trace_stack.c |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 926ebfb74936..affc35d829cc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2697,7 +2697,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 }
 
 loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
 {
 	loff_t ret;
 
@@ -3570,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3578,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
 	.read = seq_read,
 	.write = ftrace_notrace_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3783,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = {
 	.open		= ftrace_graph_open,
 	.read		= seq_read,
 	.write		= ftrace_graph_write,
+	.llseek		= ftrace_filter_lseek,
 	.release	= ftrace_graph_release,
-	.llseek		= seq_lseek,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
@@ -4439,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = {
 	.open		= ftrace_pid_open,
 	.write		= ftrace_pid_write,
 	.read		= seq_read,
-	.llseek		= seq_lseek,
+	.llseek		= ftrace_filter_lseek,
 	.release	= ftrace_pid_release,
 };
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc701..83a8b5b7bd35 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -322,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = {
 	.open = stack_trace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
-- 
cgit v1.2.3


From 7f49ef69db6bbf756c0abca7e9b65b32e999eec8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 12 Apr 2013 16:40:13 -0400
Subject: ftrace: Move ftrace_filter_lseek out of CONFIG_DYNAMIC_FTRACE section

As ftrace_filter_lseek is now used with ftrace_pid_fops, it needs to
be moved out of the #ifdef CONFIG_DYNAMIC_FTRACE section as the
ftrace_pid_fops is defined when DYNAMIC_FTRACE is not.

Cc: stable@vger.kernel.org
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index affc35d829cc..2461ede45a8d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1052,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
 
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
+loff_t
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
+{
+	loff_t ret;
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_lseek(file, offset, whence);
+	else
+		file->f_pos = ret = 1;
+
+	return ret;
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -2612,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
  * routine, you can use ftrace_filter_write() for the write
  * routine if @flag has FTRACE_ITER_FILTER set, or
  * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_regex_lseek() should be used as the lseek routine, and
+ * ftrace_filter_lseek() should be used as the lseek routine, and
  * release must call ftrace_regex_release().
  */
 int
@@ -2696,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 				 inode, file);
 }
 
-loff_t
-ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
-{
-	loff_t ret;
-
-	if (file->f_mode & FMODE_READ)
-		ret = seq_lseek(file, offset, whence);
-	else
-		file->f_pos = ret = 1;
-
-	return ret;
-}
-
 static int ftrace_match(char *str, char *regex, int len, int type)
 {
 	int matched = 0;
-- 
cgit v1.2.3


From 9f50afccfdc15d95d7331acddcb0f7703df089ae Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Thu, 11 Apr 2013 16:01:38 +0900
Subject: tracing: Reset ftrace_graph_filter_enabled if count is zero

The ftrace_graph_count can be decreased with a "!" pattern, so that
the enabled flag should be updated too.

Link: http://lkml.kernel.org/r/1365663698-2413-1-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c9f31491009f..9e3198782507 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3792,7 +3792,8 @@ out:
 	if (fail)
 		return -EINVAL;
 
-	ftrace_graph_filter_enabled = 1;
+	ftrace_graph_filter_enabled = !!(*idx);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From f1943977e6648c1d42a78eda4ba4429a2bc0b786 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Wed, 10 Apr 2013 09:18:11 +0900
Subject: tracing: Get rid of unneeded key calculation in ftrace_hash_move()

It's not used anywhere in the function.

Link: http://lkml.kernel.org/r/1365553093-10180-1-git-send-email-namhyung@kernel.org

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e3198782507..3b84fc100788 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1320,7 +1320,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
 	struct hlist_head *hhd;
 	struct ftrace_hash *old_hash;
 	struct ftrace_hash *new_hash;
-	unsigned long key;
 	int size = src->count;
 	int bits = 0;
 	int ret;
@@ -1363,10 +1362,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
 	for (i = 0; i < size; i++) {
 		hhd = &src->buckets[i];
 		hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
-			if (bits > 0)
-				key = hash_long(entry->ip, bits);
-			else
-				key = 0;
 			remove_hash_entry(src, entry);
 			__add_hash_entry(new_hash, entry);
 		}
-- 
cgit v1.2.3


From ed6f1c996bfe4b6e520cf7a74b51cd6988d84420 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Wed, 10 Apr 2013 09:18:12 +0900
Subject: tracing: Check return value of tracing_init_dentry()

Check return value and bail out if it's NULL.

Link: http://lkml.kernel.org/r/1365553093-10180-2-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c       | 2 ++
 kernel/trace/trace_stack.c | 2 ++
 kernel/trace/trace_stat.c  | 2 ++
 3 files changed, 6 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 07860b995752..72970793b40a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5953,6 +5953,8 @@ static __init int tracer_init_debugfs(void)
 	trace_access_lock_init();
 
 	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
 
 	init_tracer_debugfs(&global_trace, d_tracer);
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index aab277b67fa9..8c3f37e2dc43 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -431,6 +431,8 @@ static __init int stack_trace_init(void)
 	struct dentry *d_tracer;
 
 	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
 
 	trace_create_file("stack_max_size", 0644, d_tracer,
 			&max_stack_size, &stack_max_size_fops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e73..847f88a6194b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
 	struct dentry *d_tracing;
 
 	d_tracing = tracing_init_dentry();
+	if (!d_tracing)
+		return 0;
 
 	stat_dir = debugfs_create_dir("trace_stat", d_tracing);
 	if (!stat_dir)
-- 
cgit v1.2.3


From 20079ebe73c16b34621abd2993f3d48e2f9336b7 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Wed, 10 Apr 2013 08:55:50 +0900
Subject: ftrace: Get rid of ftrace_profile_bits

It seems that function profiler's hash size is fixed at 1024.  Add and
use FTRACE_PROFILE_HASH_BITS instead and update hash size macro.

Link: http://lkml.kernel.org/r/1365551750-4504-1-git-send-email-namhyung@kernel.org

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3b84fc100788..9b44abb2c5a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -486,7 +486,6 @@ struct ftrace_profile_stat {
 #define PROFILES_PER_PAGE					\
 	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-static int ftrace_profile_bits __read_mostly;
 static int ftrace_profile_enabled __read_mostly;
 
 /* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
 
 static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
 
-#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+#define FTRACE_PROFILE_HASH_BITS 10
+#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
 
 static void *
 function_stat_next(void *v, int idx)
@@ -725,13 +725,6 @@ static int ftrace_profile_init_cpu(int cpu)
 	if (!stat->hash)
 		return -ENOMEM;
 
-	if (!ftrace_profile_bits) {
-		size--;
-
-		for (; size; size >>= 1)
-			ftrace_profile_bits++;
-	}
-
 	/* Preallocate the function profiling pages */
 	if (ftrace_profile_pages_init(stat) < 0) {
 		kfree(stat->hash);
@@ -765,7 +758,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
 	struct hlist_node *n;
 	unsigned long key;
 
-	key = hash_long(ip, ftrace_profile_bits);
+	key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
 	hhd = &stat->hash[key];
 
 	if (hlist_empty(hhd))
@@ -784,7 +777,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
 {
 	unsigned long key;
 
-	key = hash_long(rec->ip, ftrace_profile_bits);
+	key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
 	hlist_add_head_rcu(&rec->node, &stat->hash[key]);
 }
 
-- 
cgit v1.2.3


From 07720b63a964851928fa5d8b00ee5270d66b94f7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 28 Mar 2013 18:01:04 +0100
Subject: uprobes/tracing: Kill the pointless task_pt_regs() calls

uprobe_trace_func() and uprobe_perf_func() do not need task_pt_regs(),
we already have "struct pt_regs *regs".

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8dad2a92dee9..af5b7735cebf 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -507,7 +507,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 		return 0;
 
 	entry = ring_buffer_event_data(event);
-	entry->ip = instruction_pointer(task_pt_regs(current));
+	entry->ip = instruction_pointer(regs);
 	data = (u8 *)&entry[1];
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -777,7 +777,7 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	if (!entry)
 		goto out;
 
-	entry->ip = instruction_pointer(task_pt_regs(current));
+	entry->ip = instruction_pointer(regs);
 	data = (u8 *)&entry[1];
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
-- 
cgit v1.2.3


From 456fdbcb8648a20f56977efbc6f13e28936fcf0b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 28 Mar 2013 18:58:11 +0100
Subject: uprobes/tracing: Kill the pointless seq_print_ip_sym() call

seq_print_ip_sym(ip) in print_uprobe_event() is pointless,
kallsyms_lookup(ip) can not resolve a user-space address.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index af5b7735cebf..8e009016e4ba 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -531,13 +531,7 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
 	field = (struct uprobe_trace_entry_head *)iter->ent;
 	tu = container_of(event, struct trace_uprobe, call.event);
 
-	if (!trace_seq_printf(s, "%s: (", tu->call.name))
-		goto partial;
-
-	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
-		goto partial;
-
-	if (!trace_seq_puts(s, ")"))
+	if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, field->ip))
 		goto partial;
 
 	data = (u8 *)&field[1];
-- 
cgit v1.2.3


From 0e3853d202e8b2720bc4c674dc58849b2662c8f8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 28 Mar 2013 19:19:11 +0100
Subject: uprobes/tracing: Kill the pointless local_save_flags/preempt_count
 calls

uprobe_trace_func() is never called with irqs or preemption
disabled, no need to ask preempt_count() or local_save_flags().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8e009016e4ba..43d258db998e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -492,17 +492,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
 	u8 *data;
-	int size, i, pc;
-	unsigned long irq_flags;
+	int size, i;
 	struct ftrace_event_call *call = &tu->call;
 
-	local_save_flags(irq_flags);
-	pc = preempt_count();
-
 	size = sizeof(*entry) + tu->size;
 
 	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
-						  size, irq_flags, pc);
+						  size, 0, 0);
 	if (!event)
 		return 0;
 
@@ -513,7 +509,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
-		trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+		trace_buffer_unlock_commit(buffer, event, 0, 0);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 457d1772f1c1bcf37b2ae7fc8f1d6f303d1d5cf9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 29 Mar 2013 18:26:51 +0100
Subject: uprobes/tracing: Generalize struct uprobe_trace_entry_head

struct uprobe_trace_entry_head has a single member for reporting,
"unsigned long ip". If we want to support uretprobes we need to
create another struct which has "func" and "ret_ip" and duplicate
a lot of functions, like trace_kprobe.c does.

To avoid this copy-and-paste horror we turn ->ip into ->vaddr[]
and add couple of trivial helpers to calculate sizeof/data. This
uglifies the code a bit, but this allows us to avoid a lot more
complications later, when we add the support for ret-probes.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace.h        |  5 ----
 kernel/trace/trace_uprobe.c | 62 ++++++++++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 31 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2081971367ea..8bed1dfcb938 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -103,11 +103,6 @@ struct kretprobe_trace_entry_head {
 	unsigned long		ret_ip;
 };
 
-struct uprobe_trace_entry_head {
-	struct trace_entry	ent;
-	unsigned long		ip;
-};
-
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 43d258db998e..49b400368bfd 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,6 +28,18 @@
 
 #define UPROBE_EVENT_SYSTEM	"uprobes"
 
+struct uprobe_trace_entry_head {
+	struct trace_entry	ent;
+	unsigned long		vaddr[];
+};
+
+#define SIZEOF_TRACE_ENTRY(is_return)			\
+	(sizeof(struct uprobe_trace_entry_head) +	\
+	 sizeof(unsigned long) * (is_return ? 2 : 1))
+
+#define DATAOF_TRACE_ENTRY(entry, is_return)		\
+	((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
+
 struct trace_uprobe_filter {
 	rwlock_t		rwlock;
 	int			nr_systemwide;
@@ -491,20 +503,19 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	struct uprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
-	u8 *data;
+	void *data;
 	int size, i;
 	struct ftrace_event_call *call = &tu->call;
 
-	size = sizeof(*entry) + tu->size;
-
+	size = SIZEOF_TRACE_ENTRY(false) + tu->size;
 	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
 						  size, 0, 0);
 	if (!event)
 		return 0;
 
 	entry = ring_buffer_event_data(event);
-	entry->ip = instruction_pointer(regs);
-	data = (u8 *)&entry[1];
+	entry->vaddr[0] = instruction_pointer(regs);
+	data = DATAOF_TRACE_ENTRY(entry, false);
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
@@ -518,22 +529,22 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 static enum print_line_t
 print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
 {
-	struct uprobe_trace_entry_head *field;
+	struct uprobe_trace_entry_head *entry;
 	struct trace_seq *s = &iter->seq;
 	struct trace_uprobe *tu;
 	u8 *data;
 	int i;
 
-	field = (struct uprobe_trace_entry_head *)iter->ent;
+	entry = (struct uprobe_trace_entry_head *)iter->ent;
 	tu = container_of(event, struct trace_uprobe, call.event);
 
-	if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, field->ip))
+	if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, entry->vaddr[0]))
 		goto partial;
 
-	data = (u8 *)&field[1];
+	data = DATAOF_TRACE_ENTRY(entry, false);
 	for (i = 0; i < tu->nr_args; i++) {
 		if (!tu->args[i].type->print(s, tu->args[i].name,
-					     data + tu->args[i].offset, field))
+					     data + tu->args[i].offset, entry))
 			goto partial;
 	}
 
@@ -585,16 +596,17 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)
 
 static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
 {
-	int ret, i;
+	int ret, i, size;
 	struct uprobe_trace_entry_head field;
-	struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
+	struct trace_uprobe *tu = event_call->data;
 
-	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+	DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
+	size = SIZEOF_TRACE_ENTRY(false);
 	/* Set argument names as fields */
 	for (i = 0; i < tu->nr_args; i++) {
 		ret = trace_define_field(event_call, tu->args[i].type->fmttype,
 					 tu->args[i].name,
-					 sizeof(field) + tu->args[i].offset,
+					 size + tu->args[i].offset,
 					 tu->args[i].type->size,
 					 tu->args[i].type->is_signed,
 					 FILTER_OTHER);
@@ -748,33 +760,31 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	struct ftrace_event_call *call = &tu->call;
 	struct uprobe_trace_entry_head *entry;
 	struct hlist_head *head;
-	u8 *data;
-	int size, __size, i;
-	int rctx;
+	unsigned long ip;
+	void *data;
+	int size, rctx, i;
 
 	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
 		return UPROBE_HANDLER_REMOVE;
 
-	__size = sizeof(*entry) + tu->size;
-	size = ALIGN(__size + sizeof(u32), sizeof(u64));
-	size -= sizeof(u32);
+	size = SIZEOF_TRACE_ENTRY(false);
+	size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
 		return 0;
 
 	preempt_disable();
-
 	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
 	if (!entry)
 		goto out;
 
-	entry->ip = instruction_pointer(regs);
-	data = (u8 *)&entry[1];
+	ip = instruction_pointer(regs);
+	entry->vaddr[0] = ip;
+	data = DATAOF_TRACE_ENTRY(entry, false);
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
 	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
-
+	perf_trace_buf_submit(entry, size, rctx, ip, 1, regs, head, NULL);
  out:
 	preempt_enable();
 	return 0;
@@ -784,7 +794,7 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 static
 int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
 {
-	struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
+	struct trace_uprobe *tu = event->data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
-- 
cgit v1.2.3


From a51cc6041773dd88ff35608f54274bfd6ac68652 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 30 Mar 2013 18:02:12 +0100
Subject: uprobes/tracing: Introduce uprobe_{trace,perf}_print() helpers

Extract the output code from uprobe_trace_func() and uprobe_perf_func()
into the new helpers, they will be used by ->ret_handler() too. We also
add the unused "unsigned long func" argument in advance, to simplify the
next changes.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 49b400368bfd..0c0f0a7d4ff1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -497,8 +497,8 @@ static const struct file_operations uprobe_profile_ops = {
 	.release	= seq_release,
 };
 
-/* uprobe handler */
-static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static void uprobe_trace_print(struct trace_uprobe *tu,
+				unsigned long func, struct pt_regs *regs)
 {
 	struct uprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
@@ -511,7 +511,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
 						  size, 0, 0);
 	if (!event)
-		return 0;
+		return;
 
 	entry = ring_buffer_event_data(event);
 	entry->vaddr[0] = instruction_pointer(regs);
@@ -521,7 +521,12 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_buffer_unlock_commit(buffer, event, 0, 0);
+}
 
+/* uprobe handler */
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+{
+	uprobe_trace_print(tu, 0, regs);
 	return 0;
 }
 
@@ -754,8 +759,8 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
 	return ret;
 }
 
-/* uprobe profile handler */
-static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static void uprobe_perf_print(struct trace_uprobe *tu,
+				unsigned long func, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tu->call;
 	struct uprobe_trace_entry_head *entry;
@@ -764,13 +769,10 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	void *data;
 	int size, rctx, i;
 
-	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
-		return UPROBE_HANDLER_REMOVE;
-
 	size = SIZEOF_TRACE_ENTRY(false);
 	size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
-		return 0;
+		return;
 
 	preempt_disable();
 	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
@@ -787,6 +789,15 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	perf_trace_buf_submit(entry, size, rctx, ip, 1, regs, head, NULL);
  out:
 	preempt_enable();
+}
+
+/* uprobe profile handler */
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+{
+	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+		return UPROBE_HANDLER_REMOVE;
+
+	uprobe_perf_print(tu, 0, regs);
 	return 0;
 }
 #endif	/* CONFIG_PERF_EVENTS */
-- 
cgit v1.2.3


From c1ae5c75e1034070b203dc9d4ad77ce196166a6c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 30 Mar 2013 18:25:23 +0100
Subject: uprobes/tracing: Introduce is_ret_probe() and uretprobe_dispatcher()

Create the new functions we need to support uretprobes, and change
alloc_trace_uprobe() to initialize consumer.ret_handler if the new
"is_ret" argument is true. Curently this argument is always false,
so the new code is never called and is_ret_probe(tu) is false too.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0c0f0a7d4ff1..72aa45e50d48 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -76,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);
 static LIST_HEAD(uprobe_list);
 
 static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+				unsigned long func, struct pt_regs *regs);
 
 static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
 {
@@ -89,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
 	return !filter->nr_systemwide && list_empty(&filter->perf_events);
 }
 
+static inline bool is_ret_probe(struct trace_uprobe *tu)
+{
+	return tu->consumer.ret_handler != NULL;
+}
+
 /*
  * Allocate new trace_uprobe and initialize it (including uprobes).
  */
 static struct trace_uprobe *
-alloc_trace_uprobe(const char *group, const char *event, int nargs)
+alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
 {
 	struct trace_uprobe *tu;
 
@@ -118,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
 
 	INIT_LIST_HEAD(&tu->list);
 	tu->consumer.handler = uprobe_dispatcher;
+	if (is_ret)
+		tu->consumer.ret_handler = uretprobe_dispatcher;
 	init_trace_uprobe_filter(&tu->filter);
 	return tu;
 
@@ -315,7 +324,7 @@ static int create_trace_uprobe(int argc, char **argv)
 		kfree(tail);
 	}
 
-	tu = alloc_trace_uprobe(group, event, argc);
+	tu = alloc_trace_uprobe(group, event, argc, false);
 	if (IS_ERR(tu)) {
 		pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
 		ret = PTR_ERR(tu);
@@ -530,6 +539,12 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	return 0;
 }
 
+static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
+				struct pt_regs *regs)
+{
+	uprobe_trace_print(tu, func, regs);
+}
+
 /* Event entry printers */
 static enum print_line_t
 print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
@@ -800,6 +815,12 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	uprobe_perf_print(tu, 0, regs);
 	return 0;
 }
+
+static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
+				struct pt_regs *regs)
+{
+	uprobe_perf_print(tu, func, regs);
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 static
@@ -854,6 +875,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 	return ret;
 }
 
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+				unsigned long func, struct pt_regs *regs)
+{
+	struct trace_uprobe *tu;
+
+	tu = container_of(con, struct trace_uprobe, consumer);
+
+	if (tu->flags & TP_FLAG_TRACE)
+		uretprobe_trace_func(tu, func, regs);
+
+#ifdef CONFIG_PERF_EVENTS
+	if (tu->flags & TP_FLAG_PROFILE)
+		uretprobe_perf_func(tu, func, regs);
+#endif
+	return 0;
+}
+
 static struct trace_event_functions uprobe_funcs = {
 	.trace		= print_uprobe_event
 };
-- 
cgit v1.2.3


From 393a736c280d555d9301fc5516d4d401544eb9db Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 30 Mar 2013 18:46:22 +0100
Subject: uprobes/tracing: Make uprobe_{trace,perf}_print() uretprobe-friendly

Change uprobe_trace_print() and uprobe_perf_print() to check
is_ret_probe() and fill ring_buffer_event accordingly.

Also change uprobe_trace_func() and uprobe_perf_func() to not
_print() if is_ret_probe() is true. Note that we keep ->handler()
nontrivial even for uretprobe, we need this for filtering and for
other potential extensions.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 72aa45e50d48..0ed99a27d122 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -516,15 +516,22 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
 	int size, i;
 	struct ftrace_event_call *call = &tu->call;
 
-	size = SIZEOF_TRACE_ENTRY(false) + tu->size;
+	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
-						  size, 0, 0);
+						  size + tu->size, 0, 0);
 	if (!event)
 		return;
 
 	entry = ring_buffer_event_data(event);
-	entry->vaddr[0] = instruction_pointer(regs);
-	data = DATAOF_TRACE_ENTRY(entry, false);
+	if (is_ret_probe(tu)) {
+		entry->vaddr[0] = func;
+		entry->vaddr[1] = instruction_pointer(regs);
+		data = DATAOF_TRACE_ENTRY(entry, true);
+	} else {
+		entry->vaddr[0] = instruction_pointer(regs);
+		data = DATAOF_TRACE_ENTRY(entry, false);
+	}
+
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
@@ -535,7 +542,8 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
 /* uprobe handler */
 static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 {
-	uprobe_trace_print(tu, 0, regs);
+	if (!is_ret_probe(tu))
+		uprobe_trace_print(tu, 0, regs);
 	return 0;
 }
 
@@ -784,7 +792,7 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
 	void *data;
 	int size, rctx, i;
 
-	size = SIZEOF_TRACE_ENTRY(false);
+	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
 		return;
@@ -795,8 +803,15 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
 		goto out;
 
 	ip = instruction_pointer(regs);
-	entry->vaddr[0] = ip;
-	data = DATAOF_TRACE_ENTRY(entry, false);
+	if (is_ret_probe(tu)) {
+		entry->vaddr[0] = func;
+		entry->vaddr[1] = ip;
+		data = DATAOF_TRACE_ENTRY(entry, true);
+	} else {
+		entry->vaddr[0] = ip;
+		data = DATAOF_TRACE_ENTRY(entry, false);
+	}
+
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
@@ -812,7 +827,8 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
 		return UPROBE_HANDLER_REMOVE;
 
-	uprobe_perf_print(tu, 0, regs);
+	if (!is_ret_probe(tu))
+		uprobe_perf_print(tu, 0, regs);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4d1298e2124767b4e263498485618b2e91aee5f0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 30 Mar 2013 19:23:15 +0100
Subject: uprobes/tracing: Make register_uprobe_event() paths
 uretprobe-friendly

Change uprobe_event_define_fields(), and __set_print_fmt() to check
is_ret_probe() and use the appropriate format/fields.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0ed99a27d122..457576215b3a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -628,8 +628,14 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
 	struct uprobe_trace_entry_head field;
 	struct trace_uprobe *tu = event_call->data;
 
-	DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
-	size = SIZEOF_TRACE_ENTRY(false);
+	if (is_ret_probe(tu)) {
+		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
+		DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
+		size = SIZEOF_TRACE_ENTRY(true);
+	} else {
+		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
+		size = SIZEOF_TRACE_ENTRY(false);
+	}
 	/* Set argument names as fields */
 	for (i = 0; i < tu->nr_args; i++) {
 		ret = trace_define_field(event_call, tu->args[i].type->fmttype,
@@ -652,8 +658,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
 	int i;
 	int pos = 0;
 
-	fmt = "(%lx)";
-	arg = "REC->" FIELD_STRING_IP;
+	if (is_ret_probe(tu)) {
+		fmt = "(%lx <- %lx)";
+		arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+	} else {
+		fmt = "(%lx)";
+		arg = "REC->" FIELD_STRING_IP;
+	}
 
 	/* When len=0, we just calculate the needed length */
 
-- 
cgit v1.2.3


From 3ede82dd3e3deb23429f2bf44fb600f440eef84b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 30 Mar 2013 19:48:09 +0100
Subject: uprobes/tracing: Make seq_printf() code uretprobe-friendly

Change probes_seq_show() and print_uprobe_event() to check
is_ret_probe() and print the correct data.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 457576215b3a..8c9f4894bcc0 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -435,9 +435,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
 static int probes_seq_show(struct seq_file *m, void *v)
 {
 	struct trace_uprobe *tu = v;
+	char c = is_ret_probe(tu) ? 'r' : 'p';
 	int i;
 
-	seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
+	seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);
 	seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
 
 	for (i = 0; i < tu->nr_args; i++)
@@ -566,10 +567,18 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
 	entry = (struct uprobe_trace_entry_head *)iter->ent;
 	tu = container_of(event, struct trace_uprobe, call.event);
 
-	if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, entry->vaddr[0]))
-		goto partial;
+	if (is_ret_probe(tu)) {
+		if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name,
+					entry->vaddr[1], entry->vaddr[0]))
+			goto partial;
+		data = DATAOF_TRACE_ENTRY(entry, true);
+	} else {
+		if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name,
+					entry->vaddr[0]))
+			goto partial;
+		data = DATAOF_TRACE_ENTRY(entry, false);
+	}
 
-	data = DATAOF_TRACE_ENTRY(entry, false);
 	for (i = 0; i < tu->nr_args; i++) {
 		if (!tu->args[i].type->print(s, tu->args[i].name,
 					     data + tu->args[i].offset, entry))
-- 
cgit v1.2.3


From 4ee5a52ed6301d0afa56cc995ef2c3795f45e801 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 30 Mar 2013 20:28:15 +0100
Subject: uprobes/tracing: Change create_trace_uprobe() to support uretprobes

Finally change create_trace_uprobe() to check if argv[0][0] == 'r'
and pass the correct "is_ret" to alloc_trace_uprobe().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Anton Arapov <anton@redhat.com>
---
 kernel/trace/trace_uprobe.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8c9f4894bcc0..2d08bea638a4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -201,7 +201,7 @@ end:
 
 /*
  * Argument syntax:
- *  - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
+ *  - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]
  *
  *  - Remove uprobe: -:[GRP/]EVENT
  */
@@ -213,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)
 	char buf[MAX_EVENT_NAME_LEN];
 	struct path path;
 	unsigned long offset;
-	bool is_delete;
+	bool is_delete, is_return;
 	int i, ret;
 
 	inode = NULL;
 	ret = 0;
 	is_delete = false;
+	is_return = false;
 	event = NULL;
 	group = NULL;
 
 	/* argc must be >= 1 */
 	if (argv[0][0] == '-')
 		is_delete = true;
+	else if (argv[0][0] == 'r')
+		is_return = true;
 	else if (argv[0][0] != 'p') {
-		pr_info("Probe definition must be started with 'p' or '-'.\n");
+		pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
 		return -EINVAL;
 	}
 
@@ -324,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)
 		kfree(tail);
 	}
 
-	tu = alloc_trace_uprobe(group, event, argc, false);
+	tu = alloc_trace_uprobe(group, event, argc, is_return);
 	if (IS_ERR(tu)) {
 		pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
 		ret = PTR_ERR(tu);
-- 
cgit v1.2.3


From 32520b2c695b23221751eb09360a6a3dd3105b52 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 10 Apr 2013 16:25:49 +0200
Subject: uprobes/tracing: Don't pass addr=ip to perf_trace_buf_submit()

uprobe_perf_print() passes addr=ip to perf_trace_buf_submit() for
no reason. This sets perf_sample_data->addr for PERF_SAMPLE_ADDR,
we already have perf_sample_data->ip initialized if PERF_SAMPLE_IP.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 kernel/trace/trace_uprobe.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2d08bea638a4..37ccb72f0f3b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -811,7 +811,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
 	struct ftrace_event_call *call = &tu->call;
 	struct uprobe_trace_entry_head *entry;
 	struct hlist_head *head;
-	unsigned long ip;
 	void *data;
 	int size, rctx, i;
 
@@ -825,13 +824,12 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
 	if (!entry)
 		goto out;
 
-	ip = instruction_pointer(regs);
 	if (is_ret_probe(tu)) {
 		entry->vaddr[0] = func;
-		entry->vaddr[1] = ip;
+		entry->vaddr[1] = instruction_pointer(regs);
 		data = DATAOF_TRACE_ENTRY(entry, true);
 	} else {
-		entry->vaddr[0] = ip;
+		entry->vaddr[0] = instruction_pointer(regs);
 		data = DATAOF_TRACE_ENTRY(entry, false);
 	}
 
@@ -839,7 +837,7 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
 	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, ip, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
  out:
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 515619f209114697fabd21eed1623bfa69746815 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 13 Apr 2013 15:36:49 +0200
Subject: uprobes/perf: Avoid perf_trace_buf_prepare/submit if ->perf_events is
 empty

perf_trace_buf_prepare() + perf_trace_buf_submit() make no sense
if this task/CPU has no active counters. Change uprobe_perf_print()
to return if hlist_empty(call->perf_events).

Note: this is not uprobe-specific, we can change other users too.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/trace/trace_uprobe.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 37ccb72f0f3b..32494fb0ee64 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -820,6 +820,10 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
 		return;
 
 	preempt_disable();
+	head = this_cpu_ptr(call->perf_events);
+	if (hlist_empty(head))
+		goto out;
+
 	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
 	if (!entry)
 		goto out;
@@ -836,7 +840,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
-	head = this_cpu_ptr(call->perf_events);
 	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
  out:
 	preempt_enable();
-- 
cgit v1.2.3


From 0a82a8d132b26d438eb90b3ab35a7016e7227a1d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 18 Apr 2013 09:00:26 -0700
Subject: Revert "block: add missing block_bio_complete() tracepoint"

This reverts commit 3a366e614d0837d9fc23f78cdb1a1186ebc3387f.

Wanlong Gao reports that it causes a kernel panic on his machine several
minutes after boot. Reverting it removes the panic.

Jens says:
 "It's not quite clear why that is yet, so I think we should just revert
  the commit for 3.9 final (which I'm assuming is pretty close).

  The wifi is crap at the LSF hotel, so sending this email instead of
  queueing up a revert and pull request."

Reported-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Requested-by: Jens Axboe <axboe@kernel.dk>
Cc: Tejun Heo <tj@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/blktrace.c | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272eec..5a0f781cd729 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
 				      struct request_queue *q,
 				      struct request *rq)
 {
-	struct blk_trace *bt = q->blk_trace;
-
-	/* if control ever passes through here, it's a request based driver */
-	if (unlikely(bt && !bt->rq_based))
-		bt->rq_based = true;
-
 	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
 }
 
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
 	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
-static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
+static void blk_add_trace_bio_complete(void *ignore,
+				       struct request_queue *q, struct bio *bio,
+				       int error)
 {
-	struct request_queue *q;
-	struct blk_trace *bt;
-
-	if (!bio->bi_bdev)
-		return;
-
-	q = bdev_get_queue(bio->bi_bdev);
-	bt = q->blk_trace;
-
-	/*
-	 * Request based drivers will generate both rq and bio completions.
-	 * Ignore bio ones.
-	 */
-	if (likely(!bt) || bt->rq_based)
-		return;
-
 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
-- 
cgit v1.2.3


From 6c24499f40d96bf07a85b709fb1bee5cea611a1d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 29 Apr 2013 20:08:14 -0400
Subject: tracing: Fix small merge bug

During the 3.10 merge, a conflict happened and the resolution was
almost, but not quite, correct. An if statement was reversed.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
[ Duh. That was just silly of me  - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 581630a6387d..ae6fa2d1cdf7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -904,7 +904,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	if (tr->allocated_snapshot) {
+	if (!tr->allocated_snapshot) {
 		/* Only the nop tracer should hit this when disabling */
 		WARN_ON_ONCE(tr->current_trace != &nop_trace);
 		return;
-- 
cgit v1.2.3


From 2228768885e0b92c0f7b276cc61b8974e7aed724 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 3 May 2013 11:16:18 -0400
Subject: ring-buffer: Select IRQ_WORK

As the wake up logic for waiters on the buffer has been moved
from the tracing code to the ring buffer, it requires also adding
IRQ_WORK as the wake up code is performed via irq_work.

This fixes compile breakage when a user of the ring buffer is selected
but tracing and irq_work are not.

Link http://lkml.kernel.org/r/20130503115332.GT8356@rric.localhost

Cc: Arnd Bergmann <arnd@arndb.de>
Reported-by: Robert Richter <rric@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5e9efd4b83a4..015f85aaca08 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -71,6 +71,7 @@ config TRACE_CLOCK
 config RING_BUFFER
 	bool
 	select TRACE_CLOCK
+	select IRQ_WORK
 
 config FTRACE_NMI_ENTER
        bool
@@ -107,7 +108,6 @@ config TRACING
 	select BINARY_PRINTF
 	select EVENT_TRACING
 	select TRACE_CLOCK
-	select IRQ_WORK
 
 config GENERIC_TRACER
 	bool
-- 
cgit v1.2.3


From a5b85bd1557209b4ef18a8cf07e60a1ca3132468 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:14 +0900
Subject: tracing: Don't succeed if event_enable_func did not register anything

Return 0 instead of the number of activated ftrace function probes if
event_enable_func succeeded and return an error code if it failed or
did not register any functions. But it currently returns the number
of registered functions and if it didn't register anything, it returns 0,
but that is considered success.

This also fixes the return value. As if it succeeds, it returns the
number of functions that were enabled, which is returned back to
the user in ftrace_regex_write (the write() return code). If only
one function is enabled, then the return code of the write is one,
and this can confuse the user program in thinking it only wrote 1
byte.

Link: http://lkml.kernel.org/r/20130509054413.30398.55650.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ Rewrote change log to reflect that this fixes two bugs - SR ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53582e982e51..44ac83614c3d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2061,8 +2061,11 @@ event_enable_func(struct ftrace_hash *hash,
 	if (ret < 0)
 		goto out_put;
 	ret = register_ftrace_function_probe(glob, ops, data);
-	if (!ret)
+	if (!ret) {
+		ret = -ENOENT;
 		goto out_disable;
+	} else
+		ret = 0;
  out:
 	mutex_unlock(&event_mutex);
 	return ret;
-- 
cgit v1.2.3


From ff305ded9ff83436039a16d31bc558dc6598d7ce Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 9 May 2013 11:30:26 -0400
Subject: tracing: Return error if register_ftrace_function_probe() fails for
 event_enable_func()

register_ftrace_function_probe() returns the number of functions
it registered, which can be zero, it can also return a negative number
if something went wrong. But event_enable_func() only checks for
the case that it didn't register anything, it needs to also check
for the case that something went wrong and return that error code
as well.

Added some comments about the code as well, to make it more
understandable.

Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 44ac83614c3d..87e826f1c237 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2061,11 +2061,18 @@ event_enable_func(struct ftrace_hash *hash,
 	if (ret < 0)
 		goto out_put;
 	ret = register_ftrace_function_probe(glob, ops, data);
+	/*
+	 * The above returns on success the # of functions enabled,
+	 * but if it didn't find any functions it returns zero.
+	 * Consider no functions a failure too.
+	 */
 	if (!ret) {
 		ret = -ENOENT;
 		goto out_disable;
-	} else
-		ret = 0;
+	} else if (ret < 0)
+		goto out_disable;
+	/* Just return zero, not the number of enabled functions */
+	ret = 0;
  out:
 	mutex_unlock(&event_mutex);
 	return ret;
-- 
cgit v1.2.3


From 7c088b5120ffef017e2ddc38f992277e96436ef6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 9 May 2013 11:35:12 -0400
Subject: ftrace: Have ftrace_regex_write() return either read or error

As ftrace_regex_write() reads the result of ftrace_process_regex()
which can sometimes return a positive number, only consider a
failure if the return is negative. Otherwise, it will skip possible
other registered probes and by returning a positive number that
wasn't read, it will confuse the user processes doing the writing.

Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8a5c017bb50c..d85a0ad81a67 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3276,7 +3276,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 		ret = ftrace_process_regex(iter->hash, parser->buffer,
 					   parser->idx, enable);
 		trace_parser_clear(parser);
-		if (ret)
+		if (ret < 0)
 			goto out_unlock;
 	}
 
-- 
cgit v1.2.3


From f04f24fb7e48d446bd89a01c6056571f25972511 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:17 +0900
Subject: ftrace, kprobes: Fix a deadlock on ftrace_regex_lock

Fix a deadlock on ftrace_regex_lock which happens when setting
an enable_event trigger on dynamic kprobe event as below.

----
sh-2.05b# echo p vfs_symlink > kprobe_events
sh-2.05b# echo vfs_symlink:enable_event:kprobes:p_vfs_symlink_0 > set_ftrace_filter

=============================================
[ INFO: possible recursive locking detected ]
3.9.0+ #35 Not tainted
---------------------------------------------
sh/72 is trying to acquire lock:
 (ftrace_regex_lock){+.+.+.}, at: [<ffffffff810ba6c1>] ftrace_set_hash+0x81/0x1f0

but task is already holding lock:
 (ftrace_regex_lock){+.+.+.}, at: [<ffffffff810b7cbd>] ftrace_regex_write.isra.29.part.30+0x3d/0x220

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(ftrace_regex_lock);
  lock(ftrace_regex_lock);

 *** DEADLOCK ***
----

To fix that, this introduces a finer regex_lock for each ftrace_ops.
ftrace_regex_lock is too big of a lock which protects all
filter/notrace_hash operations, but it doesn't need to be a global
lock after supporting multiple ftrace_ops because each ftrace_ops
has its own filter/notrace_hash.

Link: http://lkml.kernel.org/r/20130509054417.30398.84254.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ Added initialization flag and automate mutex initialization for
  non ftrace.c ftrace_probes. ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 73 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 52 insertions(+), 21 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d85a0ad81a67..827f2fe7bc3f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -64,6 +64,13 @@
 
 #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define INIT_REGEX_LOCK(opsname)	\
+	.regex_lock	= __MUTEX_INITIALIZER(opsname.regex_lock),
+#else
+#define INIT_REGEX_LOCK(opsname)
+#endif
+
 static struct ftrace_ops ftrace_list_end __read_mostly = {
 	.func		= ftrace_stub,
 	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
@@ -131,6 +138,16 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
 	while (likely(op = rcu_dereference_raw((op)->next)) &&	\
 	       unlikely((op) != &ftrace_list_end))
 
+static inline void ftrace_ops_init(struct ftrace_ops *ops)
+{
+#ifdef CONFIG_DYNAMIC_FTRACE
+	if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
+		mutex_init(&ops->regex_lock);
+		ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+	}
+#endif
+}
+
 /**
  * ftrace_nr_registered_ops - return number of ops registered
  *
@@ -907,7 +924,8 @@ static void unregister_ftrace_profiler(void)
 #else
 static struct ftrace_ops ftrace_profile_ops __read_mostly = {
 	.func		= function_profile_call,
-	.flags		= FTRACE_OPS_FL_RECURSION_SAFE,
+	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+	INIT_REGEX_LOCK(ftrace_profile_ops)
 };
 
 static int register_ftrace_profiler(void)
@@ -1103,11 +1121,10 @@ static struct ftrace_ops global_ops = {
 	.func			= ftrace_stub,
 	.notrace_hash		= EMPTY_HASH,
 	.filter_hash		= EMPTY_HASH,
-	.flags			= FTRACE_OPS_FL_RECURSION_SAFE,
+	.flags			= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+	INIT_REGEX_LOCK(global_ops)
 };
 
-static DEFINE_MUTEX(ftrace_regex_lock);
-
 struct ftrace_page {
 	struct ftrace_page	*next;
 	struct dyn_ftrace	*records;
@@ -1247,6 +1264,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
 
 void ftrace_free_filter(struct ftrace_ops *ops)
 {
+	ftrace_ops_init(ops);
 	free_ftrace_hash(ops->filter_hash);
 	free_ftrace_hash(ops->notrace_hash);
 }
@@ -2624,6 +2642,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 	struct ftrace_hash *hash;
 	int ret = 0;
 
+	ftrace_ops_init(ops);
+
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
@@ -2656,7 +2676,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 		}
 	}
 
-	mutex_lock(&ftrace_regex_lock);
+	mutex_lock(&ops->regex_lock);
 
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC))
@@ -2677,7 +2697,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 		}
 	} else
 		file->private_data = iter;
-	mutex_unlock(&ftrace_regex_lock);
+	mutex_unlock(&ops->regex_lock);
 
 	return ret;
 }
@@ -2910,6 +2930,8 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
 static struct ftrace_ops trace_probe_ops __read_mostly =
 {
 	.func		= function_trace_probe_call,
+	.flags		= FTRACE_OPS_FL_INITIALIZED,
+	INIT_REGEX_LOCK(trace_probe_ops)
 };
 
 static int ftrace_probe_registered;
@@ -3256,18 +3278,18 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 	if (!cnt)
 		return 0;
 
-	mutex_lock(&ftrace_regex_lock);
-
-	ret = -ENODEV;
-	if (unlikely(ftrace_disabled))
-		goto out_unlock;
-
 	if (file->f_mode & FMODE_READ) {
 		struct seq_file *m = file->private_data;
 		iter = m->private;
 	} else
 		iter = file->private_data;
 
+	mutex_lock(&iter->ops->regex_lock);
+
+	ret = -ENODEV;
+	if (unlikely(ftrace_disabled))
+		goto out_unlock;
+
 	parser = &iter->parser;
 	read = trace_get_user(parser, ubuf, cnt, ppos);
 
@@ -3282,7 +3304,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 
 	ret = read;
 out_unlock:
-	mutex_unlock(&ftrace_regex_lock);
+	mutex_unlock(&iter->ops->regex_lock);
 
 	return ret;
 }
@@ -3344,7 +3366,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
 	if (!hash)
 		return -ENOMEM;
 
-	mutex_lock(&ftrace_regex_lock);
+	mutex_lock(&ops->regex_lock);
 	if (reset)
 		ftrace_filter_reset(hash);
 	if (buf && !ftrace_match_records(hash, buf, len)) {
@@ -3366,7 +3388,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
 	mutex_unlock(&ftrace_lock);
 
  out_regex_unlock:
-	mutex_unlock(&ftrace_regex_lock);
+	mutex_unlock(&ops->regex_lock);
 
 	free_ftrace_hash(hash);
 	return ret;
@@ -3392,6 +3414,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
 			 int remove, int reset)
 {
+	ftrace_ops_init(ops);
 	return ftrace_set_addr(ops, ip, remove, reset, 1);
 }
 EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
@@ -3416,6 +3439,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
 		       int len, int reset)
 {
+	ftrace_ops_init(ops);
 	return ftrace_set_regex(ops, buf, len, reset, 1);
 }
 EXPORT_SYMBOL_GPL(ftrace_set_filter);
@@ -3434,6 +3458,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
 			int len, int reset)
 {
+	ftrace_ops_init(ops);
 	return ftrace_set_regex(ops, buf, len, reset, 0);
 }
 EXPORT_SYMBOL_GPL(ftrace_set_notrace);
@@ -3524,6 +3549,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
 {
 	char *func;
 
+	ftrace_ops_init(ops);
+
 	while (buf) {
 		func = strsep(&buf, ",");
 		ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -3551,14 +3578,14 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 	int filter_hash;
 	int ret;
 
-	mutex_lock(&ftrace_regex_lock);
 	if (file->f_mode & FMODE_READ) {
 		iter = m->private;
-
 		seq_release(inode, file);
 	} else
 		iter = file->private_data;
 
+	mutex_lock(&iter->ops->regex_lock);
+
 	parser = &iter->parser;
 	if (trace_parser_loaded(parser)) {
 		parser->buffer[parser->idx] = 0;
@@ -3587,7 +3614,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 	free_ftrace_hash(iter->hash);
 	kfree(iter);
 
-	mutex_unlock(&ftrace_regex_lock);
+	mutex_unlock(&iter->ops->regex_lock);
 	return 0;
 }
 
@@ -4126,7 +4153,8 @@ void __init ftrace_init(void)
 
 static struct ftrace_ops global_ops = {
 	.func			= ftrace_stub,
-	.flags			= FTRACE_OPS_FL_RECURSION_SAFE,
+	.flags			= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+	INIT_REGEX_LOCK(global_ops)
 };
 
 static int __init ftrace_nodyn_init(void)
@@ -4180,8 +4208,9 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
 }
 
 static struct ftrace_ops control_ops = {
-	.func = ftrace_ops_control_func,
-	.flags = FTRACE_OPS_FL_RECURSION_SAFE,
+	.func	= ftrace_ops_control_func,
+	.flags	= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+	INIT_REGEX_LOCK(control_ops)
 };
 
 static inline void
@@ -4539,6 +4568,8 @@ int register_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret = -1;
 
+	ftrace_ops_init(ops);
+
 	mutex_lock(&ftrace_lock);
 
 	ret = __register_ftrace_function(ops);
-- 
cgit v1.2.3


From 3f2367ba7cbf13ec0f3f1e93b833a7eacd1ab4b8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:21 +0900
Subject: ftrace: Cleanup regex_lock and ftrace_lock around hash updating

Cleanup regex_lock and ftrace_lock locking points around
ftrace_ops hash update code.

The new rule is that regex_lock protects ops->*_hash
read-update-write code for each ftrace_ops. Usually,
hash update is done by following sequence.

1. allocate a new local hash and copy the original hash.
2. update the local hash.
3. move(actually, copy) back the local hash to ftrace_ops.
4. update ftrace entries if needed.
5. release the local hash.

This makes regex_lock protect #1-#4, and ftrace_lock
to protect #3, #4 and adding and removing ftrace_ops from the
ftrace_ops_list. The ftrace_lock protects #3 as well because
the move functions update the entries too.

Link: http://lkml.kernel.org/r/20130509054421.30398.83411.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 59 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 27 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 827f2fe7bc3f..cacf0856191e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2656,28 +2656,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 		return -ENOMEM;
 	}
 
+	iter->ops = ops;
+	iter->flags = flag;
+
+	mutex_lock(&ops->regex_lock);
+
 	if (flag & FTRACE_ITER_NOTRACE)
 		hash = ops->notrace_hash;
 	else
 		hash = ops->filter_hash;
 
-	iter->ops = ops;
-	iter->flags = flag;
-
 	if (file->f_mode & FMODE_WRITE) {
-		mutex_lock(&ftrace_lock);
 		iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
-		mutex_unlock(&ftrace_lock);
-
 		if (!iter->hash) {
 			trace_parser_put(&iter->parser);
 			kfree(iter);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out_unlock;
 		}
 	}
 
-	mutex_lock(&ops->regex_lock);
-
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC))
 		ftrace_filter_reset(iter->hash);
@@ -2697,6 +2695,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 		}
 	} else
 		file->private_data = iter;
+
+ out_unlock:
 	mutex_unlock(&ops->regex_lock);
 
 	return ret;
@@ -3012,7 +3012,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 	if (WARN_ON(not))
 		return -EINVAL;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock(&trace_probe_ops.regex_lock);
 
 	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
 	if (!hash) {
@@ -3070,14 +3070,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 
 	} while_for_each_ftrace_rec();
 
+	mutex_lock(&ftrace_lock);
 	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
 	if (ret < 0)
 		count = ret;
 
 	__enable_ftrace_function_probe();
+	mutex_unlock(&ftrace_lock);
 
  out_unlock:
-	mutex_unlock(&ftrace_lock);
+	mutex_unlock(&trace_probe_ops.regex_lock);
 	free_ftrace_hash(hash);
 
 	return count;
@@ -3117,7 +3119,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 			return;
 	}
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock(&trace_probe_ops.regex_lock);
 
 	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
 	if (!hash)
@@ -3155,6 +3157,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 			list_add(&entry->free_list, &free_list);
 		}
 	}
+	mutex_lock(&ftrace_lock);
 	__disable_ftrace_function_probe();
 	/*
 	 * Remove after the disable is called. Otherwise, if the last
@@ -3166,9 +3169,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 		list_del(&entry->free_list);
 		ftrace_free_entry(entry);
 	}
+	mutex_unlock(&ftrace_lock);
 		
  out_unlock:
-	mutex_unlock(&ftrace_lock);
+	mutex_unlock(&trace_probe_ops.regex_lock);
 	free_ftrace_hash(hash);
 }
 
@@ -3284,11 +3288,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 	} else
 		iter = file->private_data;
 
-	mutex_lock(&iter->ops->regex_lock);
-
-	ret = -ENODEV;
 	if (unlikely(ftrace_disabled))
-		goto out_unlock;
+		return -ENODEV;
+
+	/* iter->hash is a local copy, so we don't need regex_lock */
 
 	parser = &iter->parser;
 	read = trace_get_user(parser, ubuf, cnt, ppos);
@@ -3299,13 +3302,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 					   parser->idx, enable);
 		trace_parser_clear(parser);
 		if (ret < 0)
-			goto out_unlock;
+			goto out;
 	}
 
 	ret = read;
-out_unlock:
-	mutex_unlock(&iter->ops->regex_lock);
-
+ out:
 	return ret;
 }
 
@@ -3357,16 +3358,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
+	mutex_lock(&ops->regex_lock);
+
 	if (enable)
 		orig_hash = &ops->filter_hash;
 	else
 		orig_hash = &ops->notrace_hash;
 
 	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
-	if (!hash)
-		return -ENOMEM;
+	if (!hash) {
+		ret = -ENOMEM;
+		goto out_regex_unlock;
+	}
 
-	mutex_lock(&ops->regex_lock);
 	if (reset)
 		ftrace_filter_reset(hash);
 	if (buf && !ftrace_match_records(hash, buf, len)) {
@@ -3584,8 +3588,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 	} else
 		iter = file->private_data;
 
-	mutex_lock(&iter->ops->regex_lock);
-
 	parser = &iter->parser;
 	if (trace_parser_loaded(parser)) {
 		parser->buffer[parser->idx] = 0;
@@ -3594,6 +3596,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 
 	trace_parser_put(parser);
 
+	mutex_lock(&iter->ops->regex_lock);
+
 	if (file->f_mode & FMODE_WRITE) {
 		filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
 
@@ -3611,10 +3615,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 
 		mutex_unlock(&ftrace_lock);
 	}
+
+	mutex_unlock(&iter->ops->regex_lock);
 	free_ftrace_hash(iter->hash);
 	kfree(iter);
 
-	mutex_unlock(&iter->ops->regex_lock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From cce2c8f26704529f592fc124c7c6ad399940dc5d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:24 +0900
Subject: tracing/kprobes: Fix to increment return event probe hit-count

Fix to increment probe hit-count for function return event.

Link: http://lkml.kernel.org/r/20130509054424.30398.34058.stgit@mhiramat-M0-7522

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1865d5f76538..69286337fd7e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -767,6 +767,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
 
+	tp->nhit++;
+
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-- 
cgit v1.2.3


From 30052170dcc256c18a43fb3e76577a67394543f8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:26 +0900
Subject: tracing: Indicate enabled soft-mode in enable file

Indicate enabled soft-mode event as "1*" in "enable" file
for each event, because it can be soft-disabled when disable_event
trigger is hit.

Link: http://lkml.kernel.org/r/20130509054426.30398.28202.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87e826f1c237..915c136d7bd1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -623,6 +623,8 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	if (file->flags & FTRACE_EVENT_FL_ENABLED) {
 		if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
 			buf = "0*\n";
+		else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+			buf = "1*\n";
 		else
 			buf = "1\n";
 	} else
-- 
cgit v1.2.3


From 1cf4c0732db3cd3c49cadbc60ff6bda08604e6fa Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:29 +0900
Subject: tracing: Modify soft-mode only if there's no other referrer

Modify soft-mode flag only if no other soft-mode referrer
(currently only the ftrace triggers) by using a reference
counter in each ftrace_event_file.

Without this fix, adding and removing several different
enable/disable_event triggers on the same event clear
soft-mode bit from the ftrace_event_file. This also
happens with a typo of glob on setting triggers.

e.g.

 # echo vfs_symlink:enable_event:net:netif_rx > set_ftrace_filter
 # cat events/net/netif_rx/enable
 0*
 # echo typo_func:enable_event:net:netif_rx > set_ftrace_filter
 # cat events/net/netif_rx/enable
 0
 # cat set_ftrace_filter
 #### all functions enabled ####
 vfs_symlink:enable_event:net:netif_rx:unlimited

As above, we still have a trigger, but soft-mode is gone.

Link: http://lkml.kernel.org/r/20130509054429.30398.7464.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: David Sharp <dhsharp@google.com>
Cc: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 915c136d7bd1..8be1224046f8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -251,7 +251,8 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
 	switch (enable) {
 	case 0:
 		/*
-		 * When soft_disable is set and enable is cleared, we want
+		 * When soft_disable is set and enable is cleared, the sm_ref
+		 * reference counter is decremented. If it reaches 0, we want
 		 * to clear the SOFT_DISABLED flag but leave the event in the
 		 * state that it was. That is, if the event was enabled and
 		 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
@@ -263,6 +264,8 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
 		 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
 		 */
 		if (soft_disable) {
+			if (atomic_dec_return(&file->sm_ref) > 0)
+				break;
 			disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
 			clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
 		} else
@@ -291,8 +294,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
 		 */
 		if (!soft_disable)
 			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
-		else
+		else {
+			if (atomic_inc_return(&file->sm_ref) > 1)
+				break;
 			set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+		}
 
 		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
 
@@ -1540,6 +1546,7 @@ __trace_add_new_event(struct ftrace_event_call *call,
 
 	file->event_call = call;
 	file->tr = tr;
+	atomic_set(&file->sm_ref, 0);
 	list_add(&file->list, &tr->events);
 
 	return event_create_dir(tr->event_dir, file, id, enable, filter, format);
@@ -1562,6 +1569,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
 
 	file->event_call = call;
 	file->tr = tr;
+	atomic_set(&file->sm_ref, 0);
 	list_add(&file->list, &tr->events);
 
 	return 0;
-- 
cgit v1.2.3


From da511bf33e47ea1f33f4b672f7da166d2a1b8a91 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 9 May 2013 15:00:07 -0400
Subject: tracing: Add helper function trace_create_new_event() to remove
 duplicate code

Both __trace_add_new_event() and __trace_early_add_new_event() do
basically the same thing, except that __trace_add_new_event() does
a little more.

Instead of having duplicate code between the two functions, add
a helper function trace_create_new_event() that both can use.
This will help against having bugs fixed in one function but not
the other.

Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8be1224046f8..7a0cf68027cc 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1529,6 +1529,24 @@ __register_event(struct ftrace_event_call *call, struct module *mod)
 	return 0;
 }
 
+static struct ftrace_event_file *
+trace_create_new_event(struct ftrace_event_call *call,
+		       struct trace_array *tr)
+{
+	struct ftrace_event_file *file;
+
+	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
+	if (!file)
+		return NULL;
+
+	file->event_call = call;
+	file->tr = tr;
+	atomic_set(&file->sm_ref, 0);
+	list_add(&file->list, &tr->events);
+
+	return file;
+}
+
 /* Add an event to a trace directory */
 static int
 __trace_add_new_event(struct ftrace_event_call *call,
@@ -1540,15 +1558,10 @@ __trace_add_new_event(struct ftrace_event_call *call,
 {
 	struct ftrace_event_file *file;
 
-	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
+	file = trace_create_new_event(call, tr);
 	if (!file)
 		return -ENOMEM;
 
-	file->event_call = call;
-	file->tr = tr;
-	atomic_set(&file->sm_ref, 0);
-	list_add(&file->list, &tr->events);
-
 	return event_create_dir(tr->event_dir, file, id, enable, filter, format);
 }
 
@@ -1563,15 +1576,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
 {
 	struct ftrace_event_file *file;
 
-	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
+	file = trace_create_new_event(call, tr);
 	if (!file)
 		return -ENOMEM;
 
-	file->event_call = call;
-	file->tr = tr;
-	atomic_set(&file->sm_ref, 0);
-	list_add(&file->list, &tr->events);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5ae0bf5972b66d35e5674e1b7d855b1e111a68ae Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 9 May 2013 18:20:37 -0400
Subject: ftrace: Fix locking in register_ftrace_function_probe()

The iteration of the ftrace function list and the call to
ftrace_match_record() need to be protected by the ftrace_lock.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cacf0856191e..f104c45cbcc1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3017,14 +3017,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
 	if (!hash) {
 		count = -ENOMEM;
-		goto out_unlock;
+		goto out;
 	}
 
 	if (unlikely(ftrace_disabled)) {
 		count = -ENODEV;
-		goto out_unlock;
+		goto out;
 	}
 
+	mutex_lock(&ftrace_lock);
+
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (!ftrace_match_record(rec, NULL, search, len, type))
@@ -3070,15 +3072,15 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 
 	} while_for_each_ftrace_rec();
 
-	mutex_lock(&ftrace_lock);
 	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
 	if (ret < 0)
 		count = ret;
 
 	__enable_ftrace_function_probe();
-	mutex_unlock(&ftrace_lock);
 
  out_unlock:
+	mutex_unlock(&ftrace_lock);
+ out:
 	mutex_unlock(&trace_probe_ops.regex_lock);
 	free_ftrace_hash(hash);
 
-- 
cgit v1.2.3


From 23ea9c4dda129fe1711f9fbda03c7a9c91cf1322 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 9 May 2013 19:31:48 -0400
Subject: ftrace: Fix the output of enabled_functions debug file

The enabled_functions debugfs file was created to be able to see
what functions have been modified from nops to calling a tracer.

The current method uses the counter in the function record.
As when a ftrace_ops is registered to a function, its count
increases. But that doesn't mean that the function is actively
being traced. /proc/sys/kernel/ftrace_enabled can be set to zero
which would disable it, as well as something can go wrong and
we can think its enabled when only the counter is set.

The record's FTRACE_FL_ENABLED flag is set or cleared when its
function is modified. That is a much more accurate way of knowing
what function is enabled or not.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f104c45cbcc1..dcca9fa29bf4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2459,7 +2459,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		     !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
 
 		    ((iter->flags & FTRACE_ITER_ENABLED) &&
-		     !(rec->flags & ~FTRACE_FL_MASK))) {
+		     !(rec->flags & FTRACE_FL_ENABLED))) {
 
 			rec = NULL;
 			goto retry;
-- 
cgit v1.2.3


From 19dd603e45199d93d61e9853c596d098e04e5d66 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 9 May 2013 19:37:36 -0400
Subject: ftrace: Fix function probe when more than one probe is added

When the first function probe is added and the function tracer
is updated the functions are modified to call the probe.
But when a second function is added, it updates the function
records to have the second function also update, but it fails
to update the actual function itself.

This prevents the second (or third or forth and so on) probes
from having their functions called.

  # echo vfs_symlink:enable_event:sched:sched_switch > set_ftrace_filter
  # echo vfs_unlink:enable_event:sched:sched_switch > set_ftrace_filter
  # cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 0/0   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
  # touch /tmp/a
  # rm /tmp/a
  # cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 0/0   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
  # ln -s /tmp/a
  # cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 414/414   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
           <idle>-0     [000] d..3  2847.923031: sched_switch: prev_comm=swapper/0 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=bash next_pid=2786 next_prio=120
            <...>-3114  [001] d..4  2847.923035: sched_switch: prev_comm=ln prev_pid=3114 prev_prio=120 prev_state=x ==> next_comm=swapper/1 next_pid=0 next_prio=120
             bash-2786  [000] d..3  2847.923535: sched_switch: prev_comm=bash prev_pid=2786 prev_prio=120 prev_state=S ==> next_comm=kworker/0:1 next_pid=34 next_prio=120
      kworker/0:1-34    [000] d..3  2847.923552: sched_switch: prev_comm=kworker/0:1 prev_pid=34 prev_prio=120 prev_state=S ==> next_comm=swapper/0 next_pid=0 next_prio=120
           <idle>-0     [002] d..3  2847.923554: sched_switch: prev_comm=swapper/2 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=sshd next_pid=2783 next_prio=120
             sshd-2783  [002] d..3  2847.923660: sched_switch: prev_comm=sshd prev_pid=2783 prev_prio=120 prev_state=S ==> next_comm=swapper/2 next_pid=0 next_prio=120

Still need to update the functions even though the probe itself
does not need to be registered again when added a new probe.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index dcca9fa29bf4..b549b0f5b977 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2941,8 +2941,12 @@ static void __enable_ftrace_function_probe(void)
 	int ret;
 	int i;
 
-	if (ftrace_probe_registered)
+	if (ftrace_probe_registered) {
+		/* still need to update the function call sites */
+		if (ftrace_enabled)
+			ftrace_run_update_code(FTRACE_UPDATE_CALLS);
 		return;
+	}
 
 	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
 		struct hlist_head *hhd = &ftrace_func_hash[i];
-- 
cgit v1.2.3


From db02038f4e6a776fd3a0bb71242be37ff378ce86 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:32 +0900
Subject: tracing/kprobes: Use bool for retprobe checker

Use bool instead of int for kretprobe checker.

Link: http://lkml.kernel.org/r/20130509054431.30398.38561.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 69286337fd7e..0b7386a54b1e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -46,7 +46,7 @@ struct trace_probe {
 	(sizeof(struct probe_arg) * (n)))
 
 
-static __kprobes int trace_probe_is_return(struct trace_probe *tp)
+static __kprobes bool trace_probe_is_return(struct trace_probe *tp)
 {
 	return tp->rp.handler != NULL;
 }
-- 
cgit v1.2.3


From 48182bd2261766b810e4e4269a23236c1ace63fb Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:36 +0900
Subject: tracing/kprobes: Increment probe hit-count even if it is used by perf

Increment probe hit-count for profiling even if it is used
by perf tool. Same thing has already done in trace_uprobe.

Link: http://lkml.kernel.org/r/20130509054436.30398.21133.stgit@mhiramat-M0-7522

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 0b7386a54b1e..6e86fbbae337 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -733,8 +733,6 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
 
-	tp->nhit++;
-
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
@@ -767,8 +765,6 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
 
-	tp->nhit++;
-
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
@@ -1075,6 +1071,8 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 
+	tp->nhit++;
+
 	if (tp->flags & TP_FLAG_TRACE)
 		kprobe_trace_func(kp, regs);
 #ifdef CONFIG_PERF_EVENTS
@@ -1089,6 +1087,8 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 
+	tp->nhit++;
+
 	if (tp->flags & TP_FLAG_TRACE)
 		kretprobe_trace_func(ri, regs);
 #ifdef CONFIG_PERF_EVENTS
-- 
cgit v1.2.3


From 2b106aabe6c566ba19c352f22683381e1ea41326 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:41 +0900
Subject: tracing/kprobes: Pass trace_probe directly from dispatcher

Pass the pointer of struct trace_probe directly from probe
dispatcher to handlers. This removes redundant container_of
macro uses. Same thing has already done in trace_uprobe.

Link: http://lkml.kernel.org/r/20130509054441.30398.69112.stgit@mhiramat-M0-7522

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6e86fbbae337..9ca44fc3fb0b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -723,9 +723,9 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
 }
 
 /* Kprobe handler */
-static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static __kprobes void
+kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
 {
-	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct kprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
@@ -745,7 +745,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 		return;
 
 	entry = ring_buffer_event_data(event);
-	entry->ip = (unsigned long)kp->addr;
+	entry->ip = (unsigned long)tp->rp.kp.addr;
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
@@ -754,10 +754,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 }
 
 /* Kretprobe handler */
-static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
-					  struct pt_regs *regs)
+static __kprobes void
+kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
+		     struct pt_regs *regs)
 {
-	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct kretprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
@@ -973,10 +973,9 @@ static int set_print_fmt(struct trace_probe *tp)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static __kprobes void kprobe_perf_func(struct kprobe *kp,
-					 struct pt_regs *regs)
+static __kprobes void
+kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
 {
-	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
@@ -995,7 +994,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 	if (!entry)
 		return;
 
-	entry->ip = (unsigned long)kp->addr;
+	entry->ip = (unsigned long)tp->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
@@ -1005,10 +1004,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 }
 
 /* Kretprobe profile handler */
-static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
-					    struct pt_regs *regs)
+static __kprobes void
+kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
+		    struct pt_regs *regs)
 {
-	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
@@ -1074,10 +1073,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 	tp->nhit++;
 
 	if (tp->flags & TP_FLAG_TRACE)
-		kprobe_trace_func(kp, regs);
+		kprobe_trace_func(tp, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tp->flags & TP_FLAG_PROFILE)
-		kprobe_perf_func(kp, regs);
+		kprobe_perf_func(tp, regs);
 #endif
 	return 0;	/* We don't tweek kernel, so just return 0 */
 }
@@ -1090,10 +1089,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 	tp->nhit++;
 
 	if (tp->flags & TP_FLAG_TRACE)
-		kretprobe_trace_func(ri, regs);
+		kretprobe_trace_func(tp, ri, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tp->flags & TP_FLAG_PROFILE)
-		kretprobe_perf_func(ri, regs);
+		kretprobe_perf_func(tp, ri, regs);
 #endif
 	return 0;	/* We don't tweek kernel, so just return 0 */
 }
-- 
cgit v1.2.3


From 41a7dd420c57323391d58b553318c1fad8e7ebc2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:49 +0900
Subject: tracing/kprobes: Support ftrace_event_file base multibuffer

Support multi-buffer on kprobe-based dynamic events by
using ftrace_event_file.

Link: http://lkml.kernel.org/r/20130509054449.30398.88343.stgit@mhiramat-M0-7522

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 250 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 214 insertions(+), 36 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9ca44fc3fb0b..fee865d8a7c4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -27,7 +27,6 @@
 /**
  * Kprobe event core functions
  */
-
 struct trace_probe {
 	struct list_head	list;
 	struct kretprobe	rp;	/* Use rp.kp for kprobe use */
@@ -36,6 +35,7 @@ struct trace_probe {
 	const char		*symbol;	/* symbol name */
 	struct ftrace_event_class	class;
 	struct ftrace_event_call	call;
+	struct ftrace_event_file	**files;
 	ssize_t			size;		/* trace entry size */
 	unsigned int		nr_args;
 	struct probe_arg	args[];
@@ -183,12 +183,57 @@ static struct trace_probe *find_trace_probe(const char *event,
 	return NULL;
 }
 
-/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
-static int enable_trace_probe(struct trace_probe *tp, int flag)
+static int trace_probe_nr_files(struct trace_probe *tp)
+{
+	struct ftrace_event_file **file = tp->files;
+	int ret = 0;
+
+	if (file)
+		while (*(file++))
+			ret++;
+
+	return ret;
+}
+
+static DEFINE_MUTEX(probe_enable_lock);
+
+/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int
+enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
 {
 	int ret = 0;
 
-	tp->flags |= flag;
+	mutex_lock(&probe_enable_lock);
+
+	if (file) {
+		struct ftrace_event_file **new, **old = tp->files;
+		int n = trace_probe_nr_files(tp);
+
+		/* 1 is for new one and 1 is for stopper */
+		new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
+			      GFP_KERNEL);
+		if (!new) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		memcpy(new, old, n * sizeof(struct ftrace_event_file *));
+		new[n] = file;
+		/* The last one keeps a NULL */
+
+		rcu_assign_pointer(tp->files, new);
+		tp->flags |= TP_FLAG_TRACE;
+
+		if (old) {
+			/* Make sure the probe is done with old files */
+			synchronize_sched();
+			kfree(old);
+		}
+	} else
+		tp->flags |= TP_FLAG_PROFILE;
+
 	if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
 	    !trace_probe_has_gone(tp)) {
 		if (trace_probe_is_return(tp))
@@ -197,19 +242,83 @@ static int enable_trace_probe(struct trace_probe *tp, int flag)
 			ret = enable_kprobe(&tp->rp.kp);
 	}
 
+ out_unlock:
+	mutex_unlock(&probe_enable_lock);
+
 	return ret;
 }
 
-/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
-static void disable_trace_probe(struct trace_probe *tp, int flag)
+static int
+trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
+{
+	int i;
+
+	if (tp->files) {
+		for (i = 0; tp->files[i]; i++)
+			if (tp->files[i] == file)
+				return i;
+	}
+
+	return -1;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int
+disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
 {
-	tp->flags &= ~flag;
+	int ret = 0;
+
+	mutex_lock(&probe_enable_lock);
+
+	if (file) {
+		struct ftrace_event_file **new, **old = tp->files;
+		int n = trace_probe_nr_files(tp);
+		int i, j;
+
+		if (n == 0 || trace_probe_file_index(tp, file) < 0) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		if (n == 1) {	/* Remove the last file */
+			tp->flags &= ~TP_FLAG_TRACE;
+			new = NULL;
+		} else {
+			new = kzalloc(n * sizeof(struct ftrace_event_file *),
+				      GFP_KERNEL);
+			if (!new) {
+				ret = -ENOMEM;
+				goto out_unlock;
+			}
+
+			/* This copy & check loop copies the NULL stopper too */
+			for (i = 0, j = 0; j < n && i < n + 1; i++)
+				if (old[i] != file)
+					new[j++] = old[i];
+		}
+
+		rcu_assign_pointer(tp->files, new);
+
+		/* Make sure the probe is done with old files */
+		synchronize_sched();
+		kfree(old);
+	} else
+		tp->flags &= ~TP_FLAG_PROFILE;
+
 	if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
 		if (trace_probe_is_return(tp))
 			disable_kretprobe(&tp->rp);
 		else
 			disable_kprobe(&tp->rp.kp);
 	}
+
+ out_unlock:
+	mutex_unlock(&probe_enable_lock);
+
+	return ret;
 }
 
 /* Internal register function - just handle k*probes and flags */
@@ -724,7 +833,8 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
 
 /* Kprobe handler */
 static __kprobes void
-kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
+__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
+		    struct ftrace_event_file *ftrace_file)
 {
 	struct kprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
@@ -733,14 +843,17 @@ kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
 
+	WARN_ON(call != ftrace_file->event_call);
+
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
 	dsize = __get_data_size(tp, regs);
 	size = sizeof(*entry) + tp->size + dsize;
 
-	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
-						  size, irq_flags, pc);
+	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+						call->event.type,
+						size, irq_flags, pc);
 	if (!event)
 		return;
 
@@ -753,10 +866,23 @@ kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
 						irq_flags, pc, regs);
 }
 
+static __kprobes void
+kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
+{
+	struct ftrace_event_file **file = tp->files;
+
+	/* Note: preempt is already disabled around the kprobe handler */
+	while (*file) {
+		__kprobe_trace_func(tp, regs, *file);
+		file++;
+	}
+}
+
 /* Kretprobe handler */
 static __kprobes void
-kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
-		     struct pt_regs *regs)
+__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
+		       struct pt_regs *regs,
+		       struct ftrace_event_file *ftrace_file)
 {
 	struct kretprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
@@ -765,14 +891,17 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
 
+	WARN_ON(call != ftrace_file->event_call);
+
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
 	dsize = __get_data_size(tp, regs);
 	size = sizeof(*entry) + tp->size + dsize;
 
-	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
-						  size, irq_flags, pc);
+	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+						call->event.type,
+						size, irq_flags, pc);
 	if (!event)
 		return;
 
@@ -786,6 +915,19 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
 						irq_flags, pc, regs);
 }
 
+static __kprobes void
+kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
+		     struct pt_regs *regs)
+{
+	struct ftrace_event_file **file = tp->files;
+
+	/* Note: preempt is already disabled around the kprobe handler */
+	while (*file) {
+		__kretprobe_trace_func(tp, ri, regs, *file);
+		file++;
+	}
+}
+
 /* Event entry printers */
 enum print_line_t
 print_kprobe_event(struct trace_iterator *iter, int flags,
@@ -1041,20 +1183,19 @@ int kprobe_register(struct ftrace_event_call *event,
 		    enum trace_reg type, void *data)
 {
 	struct trace_probe *tp = (struct trace_probe *)event->data;
+	struct ftrace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
-		return enable_trace_probe(tp, TP_FLAG_TRACE);
+		return enable_trace_probe(tp, file);
 	case TRACE_REG_UNREGISTER:
-		disable_trace_probe(tp, TP_FLAG_TRACE);
-		return 0;
+		return disable_trace_probe(tp, file);
 
 #ifdef CONFIG_PERF_EVENTS
 	case TRACE_REG_PERF_REGISTER:
-		return enable_trace_probe(tp, TP_FLAG_PROFILE);
+		return enable_trace_probe(tp, NULL);
 	case TRACE_REG_PERF_UNREGISTER:
-		disable_trace_probe(tp, TP_FLAG_PROFILE);
-		return 0;
+		return disable_trace_probe(tp, NULL);
 	case TRACE_REG_PERF_OPEN:
 	case TRACE_REG_PERF_CLOSE:
 	case TRACE_REG_PERF_ADD:
@@ -1190,11 +1331,24 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
 	return a1 + a2 + a3 + a4 + a5 + a6;
 }
 
+static struct ftrace_event_file *
+find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr)
+{
+	struct ftrace_event_file *file;
+
+	list_for_each_entry(file, &tr->events, list)
+		if (file->event_call == &tp->call)
+			return file;
+
+	return NULL;
+}
+
 static __init int kprobe_trace_self_tests_init(void)
 {
 	int ret, warn = 0;
 	int (*target)(int, int, int, int, int, int);
 	struct trace_probe *tp;
+	struct ftrace_event_file *file;
 
 	target = kprobe_trace_selftest_target;
 
@@ -1204,31 +1358,43 @@ static __init int kprobe_trace_self_tests_init(void)
 				  "$stack $stack0 +0($stack)",
 				  create_trace_probe);
 	if (WARN_ON_ONCE(ret)) {
-		pr_warning("error on probing function entry.\n");
+		pr_warn("error on probing function entry.\n");
 		warn++;
 	} else {
 		/* Enable trace point */
 		tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
 		if (WARN_ON_ONCE(tp == NULL)) {
-			pr_warning("error on getting new probe.\n");
+			pr_warn("error on getting new probe.\n");
 			warn++;
-		} else
-			enable_trace_probe(tp, TP_FLAG_TRACE);
+		} else {
+			file = find_trace_probe_file(tp, top_trace_array());
+			if (WARN_ON_ONCE(file == NULL)) {
+				pr_warn("error on getting probe file.\n");
+				warn++;
+			} else
+				enable_trace_probe(tp, file);
+		}
 	}
 
 	ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
 				  "$retval", create_trace_probe);
 	if (WARN_ON_ONCE(ret)) {
-		pr_warning("error on probing function return.\n");
+		pr_warn("error on probing function return.\n");
 		warn++;
 	} else {
 		/* Enable trace point */
 		tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
 		if (WARN_ON_ONCE(tp == NULL)) {
-			pr_warning("error on getting new probe.\n");
+			pr_warn("error on getting 2nd new probe.\n");
 			warn++;
-		} else
-			enable_trace_probe(tp, TP_FLAG_TRACE);
+		} else {
+			file = find_trace_probe_file(tp, top_trace_array());
+			if (WARN_ON_ONCE(file == NULL)) {
+				pr_warn("error on getting probe file.\n");
+				warn++;
+			} else
+				enable_trace_probe(tp, file);
+		}
 	}
 
 	if (warn)
@@ -1239,27 +1405,39 @@ static __init int kprobe_trace_self_tests_init(void)
 	/* Disable trace points before removing it */
 	tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
 	if (WARN_ON_ONCE(tp == NULL)) {
-		pr_warning("error on getting test probe.\n");
+		pr_warn("error on getting test probe.\n");
 		warn++;
-	} else
-		disable_trace_probe(tp, TP_FLAG_TRACE);
+	} else {
+		file = find_trace_probe_file(tp, top_trace_array());
+		if (WARN_ON_ONCE(file == NULL)) {
+			pr_warn("error on getting probe file.\n");
+			warn++;
+		} else
+			disable_trace_probe(tp, file);
+	}
 
 	tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
 	if (WARN_ON_ONCE(tp == NULL)) {
-		pr_warning("error on getting 2nd test probe.\n");
+		pr_warn("error on getting 2nd test probe.\n");
 		warn++;
-	} else
-		disable_trace_probe(tp, TP_FLAG_TRACE);
+	} else {
+		file = find_trace_probe_file(tp, top_trace_array());
+		if (WARN_ON_ONCE(file == NULL)) {
+			pr_warn("error on getting probe file.\n");
+			warn++;
+		} else
+			disable_trace_probe(tp, file);
+	}
 
 	ret = traceprobe_command("-:testprobe", create_trace_probe);
 	if (WARN_ON_ONCE(ret)) {
-		pr_warning("error on deleting a probe.\n");
+		pr_warn("error on deleting a probe.\n");
 		warn++;
 	}
 
 	ret = traceprobe_command("-:testprobe2", create_trace_probe);
 	if (WARN_ON_ONCE(ret)) {
-		pr_warning("error on deleting a probe.\n");
+		pr_warn("error on deleting a probe.\n");
 		warn++;
 	}
 
-- 
cgit v1.2.3


From b8820084f2130b3dcfb09c78ac16cdd2194a345b Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 9 May 2013 14:44:54 +0900
Subject: tracing/kprobes: Support soft-mode disabling

Support soft-mode disabling on kprobe-based dynamic events.
Soft-disabling is just ignoring recording if the soft disabled
flag is set.

Link: http://lkml.kernel.org/r/20130509054454.30398.7237.stgit@mhiramat-M0-7522

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index fee865d8a7c4..636d45fe69b3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -845,6 +845,9 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
 
 	WARN_ON(call != ftrace_file->event_call);
 
+	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
+		return;
+
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
@@ -893,6 +896,9 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
 
 	WARN_ON(call != ftrace_file->event_call);
 
+	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
+		return;
+
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-- 
cgit v1.2.3