From 5168ae50a66e3ff7184c2b16d661bd6d70367e50 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 3 Jun 2010 09:36:50 -0400
Subject: tracing: Remove ftrace_preempt_disable/enable

The ftrace_preempt_disable/enable functions were to address a
recursive race caused by the function tracer. The function tracer
traces all functions which makes it easily susceptible to recursion.
One area was preempt_enable(). This would call the scheduler and
the schedulre would call the function tracer and loop.
(So was it thought).

The ftrace_preempt_disable/enable was made to protect against recursion
inside the scheduler by storing the NEED_RESCHED flag. If it was
set before the ftrace_preempt_disable() it would not call schedule
on ftrace_preempt_enable(), thinking that if it was set before then
it would have already scheduled unless it was already in the scheduler.

This worked fine except in the case of SMP, where another task would set
the NEED_RESCHED flag for a task on another CPU, and then kick off an
IPI to trigger it. This could cause the NEED_RESCHED to be saved at
ftrace_preempt_disable() but the IPI to arrive in the the preempt
disabled section. The ftrace_preempt_enable() would not call the scheduler
because the flag was already set before entring the section.

This bug would cause a missed preemption check and cause lower latencies.

Investigating further, I found that the recusion caused by the function
tracer was not due to schedule(), but due to preempt_schedule(). Now
that preempt_schedule is completely annotated with notrace, the recusion
no longer is an issue.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c             |  5 ++--
 kernel/trace/ring_buffer.c        | 38 +++++++------------------------
 kernel/trace/trace.c              |  5 ++--
 kernel/trace/trace.h              | 48 ---------------------------------------
 kernel/trace/trace_clock.c        |  5 ++--
 kernel/trace/trace_events.c       |  5 ++--
 kernel/trace/trace_functions.c    |  6 ++---
 kernel/trace/trace_sched_wakeup.c |  5 ++--
 kernel/trace/trace_stack.c        |  6 ++---
 9 files changed, 24 insertions(+), 99 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..0d88ce9b9fb8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 	struct hlist_head *hhd;
 	struct hlist_node *n;
 	unsigned long key;
-	int resched;
 
 	key = hash_long(ip, FTRACE_HASH_BITS);
 
@@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 	 * period. This syncs the hash iteration and freeing of items
 	 * on the hash. rcu_read_lock is too dangerous here.
 	 */
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 	hlist_for_each_entry_rcu(entry, n, hhd, node) {
 		if (entry->ip == ip)
 			entry->ops->func(ip, parent_ip, &entry->data);
 	}
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 }
 
 static struct ftrace_ops trace_probe_ops __read_mostly =
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f6059c5aa94..c3d3cd9c2a53 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2234,8 +2234,6 @@ static void trace_recursive_unlock(void)
 
 #endif
 
-static DEFINE_PER_CPU(int, rb_need_resched);
-
 /**
  * ring_buffer_lock_reserve - reserve a part of the buffer
  * @buffer: the ring buffer to reserve from
@@ -2256,13 +2254,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
-	int cpu, resched;
+	int cpu;
 
 	if (ring_buffer_flags != RB_BUFFERS_ON)
 		return NULL;
 
 	/* If we are tracing schedule, we don't want to recurse */
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 
 	if (atomic_read(&buffer->record_disabled))
 		goto out_nocheck;
@@ -2287,21 +2285,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (!event)
 		goto out;
 
-	/*
-	 * Need to store resched state on this cpu.
-	 * Only the first needs to.
-	 */
-
-	if (preempt_count() == 1)
-		per_cpu(rb_need_resched, cpu) = resched;
-
 	return event;
 
  out:
 	trace_recursive_unlock();
 
  out_nocheck:
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2347,13 +2337,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	trace_recursive_unlock();
 
-	/*
-	 * Only the last preempt count needs to restore preemption.
-	 */
-	if (preempt_count() == 1)
-		ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
-	else
-		preempt_enable_no_resched_notrace();
+	preempt_enable_notrace();
 
 	return 0;
 }
@@ -2461,13 +2445,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 
 	trace_recursive_unlock();
 
-	/*
-	 * Only the last preempt count needs to restore preemption.
-	 */
-	if (preempt_count() == 1)
-		ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
-	else
-		preempt_enable_no_resched_notrace();
+	preempt_enable_notrace();
 
 }
 EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2493,12 +2471,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	void *body;
 	int ret = -EBUSY;
-	int cpu, resched;
+	int cpu;
 
 	if (ring_buffer_flags != RB_BUFFERS_ON)
 		return -EBUSY;
 
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 
 	if (atomic_read(&buffer->record_disabled))
 		goto out;
@@ -2528,7 +2506,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
 	ret = 0;
  out:
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 
 	return ret;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55e48511d7c8..35727140f4fb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1404,7 +1404,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	struct bprint_entry *entry;
 	unsigned long flags;
 	int disable;
-	int resched;
 	int cpu, len = 0, size, pc;
 
 	if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1413,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	pause_graph_tracing();
 
 	pc = preempt_count();
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
@@ -1452,7 +1451,7 @@ out_unlock:
 
 out:
 	atomic_dec_return(&data->disabled);
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 	unpause_graph_tracing();
 
 	return len;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..6c45e55097ce 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -628,54 +628,6 @@ enum trace_iterator_flags {
 
 extern struct tracer nop_trace;
 
-/**
- * ftrace_preempt_disable - disable preemption scheduler safe
- *
- * When tracing can happen inside the scheduler, there exists
- * cases that the tracing might happen before the need_resched
- * flag is checked. If this happens and the tracer calls
- * preempt_enable (after a disable), a schedule might take place
- * causing an infinite recursion.
- *
- * To prevent this, we read the need_resched flag before
- * disabling preemption. When we want to enable preemption we
- * check the flag, if it is set, then we call preempt_enable_no_resched.
- * Otherwise, we call preempt_enable.
- *
- * The rational for doing the above is that if need_resched is set
- * and we have yet to reschedule, we are either in an atomic location
- * (where we do not need to check for scheduling) or we are inside
- * the scheduler and do not want to resched.
- */
-static inline int ftrace_preempt_disable(void)
-{
-	int resched;
-
-	resched = need_resched();
-	preempt_disable_notrace();
-
-	return resched;
-}
-
-/**
- * ftrace_preempt_enable - enable preemption scheduler safe
- * @resched: the return value from ftrace_preempt_disable
- *
- * This is a scheduler safe way to enable preemption and not miss
- * any preemption checks. The disabled saved the state of preemption.
- * If resched is set, then we are either inside an atomic or
- * are inside the scheduler (we would have already scheduled
- * otherwise). In this case, we do not want to call normal
- * preempt_enable, but preempt_enable_no_resched instead.
- */
-static inline void ftrace_preempt_enable(int resched)
-{
-	if (resched)
-		preempt_enable_no_resched_notrace();
-	else
-		preempt_enable_notrace();
-}
-
 #ifdef CONFIG_BRANCH_TRACER
 extern int enable_branch_tracing(struct trace_array *tr);
 extern void disable_branch_tracing(void);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..52fda6c04ac3 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
 u64 notrace trace_clock_local(void)
 {
 	u64 clock;
-	int resched;
 
 	/*
 	 * sched_clock() is an architecture implemented, fast, scalable,
 	 * lockless clock. It is not guaranteed to be coherent across
 	 * CPUs, nor across CPU idle events.
 	 */
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 	clock = sched_clock();
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 
 	return clock;
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..a594f9a7ee3d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1524,12 +1524,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
 	struct ftrace_entry *entry;
 	unsigned long flags;
 	long disabled;
-	int resched;
 	int cpu;
 	int pc;
 
 	pc = preempt_count();
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 	cpu = raw_smp_processor_id();
 	disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
 
@@ -1551,7 +1550,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
 
  out:
 	atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 }
 
 static struct ftrace_ops trace_ops __initdata  =
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
-	int cpu, resched;
+	int cpu;
 	int pc;
 
 	if (unlikely(!ftrace_function_enabled))
 		return;
 
 	pc = preempt_count();
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 	local_save_flags(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
 		trace_function(tr, ip, parent_ip, flags, pc);
 
 	atomic_dec(&data->disabled);
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 }
 
 static void
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0e73bc2ef8c5..c9fd5bd02036 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
-	int resched;
 	int cpu;
 	int pc;
 
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 		return;
 
 	pc = preempt_count();
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 
 	cpu = raw_smp_processor_id();
 	if (cpu != wakeup_current_cpu)
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
  out:
 	atomic_dec(&data->disabled);
  out_enable:
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 }
 
 static struct ftrace_ops trace_ops __read_mostly =
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..056468eae7cf 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
 static void
 stack_trace_call(unsigned long ip, unsigned long parent_ip)
 {
-	int cpu, resched;
+	int cpu;
 
 	if (unlikely(!ftrace_enabled || stack_trace_disabled))
 		return;
 
-	resched = ftrace_preempt_disable();
+	preempt_disable_notrace();
 
 	cpu = raw_smp_processor_id();
 	/* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
  out:
 	per_cpu(trace_active, cpu)--;
 	/* prevent recursion in schedule */
-	ftrace_preempt_enable(resched);
+	preempt_enable_notrace();
 }
 
 static struct ftrace_ops trace_ops __read_mostly =
-- 
cgit v1.2.3


From c9cf4dbb4d9ca715d8fedf13301a53296429abc6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 19 May 2010 21:35:17 +0200
Subject: x86: Unify dumpstack.h and stacktrace.h

arch/x86/include/asm/stacktrace.h and arch/x86/kernel/dumpstack.h
declare headers of objects that deal with the same topic.
Actually most of the files that include stacktrace.h also include
dumpstack.h

Although dumpstack.h seems more reserved for internals of stack
traces, those are quite often needed to define specialized stack
trace operations. And perf event arch headers are going to need
access to such low level operations anyway. So don't continue to
bother with dumpstack.h as it's not anymore about isolated deep
internals.

v2: fix struct stack_frame definition conflict in sysprof

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Soeren Sandmann <sandmann@daimi.au.dk>
---
 kernel/trace/trace_sysprof.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index a7974a552ca9..c080956f4d8e 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -33,12 +33,13 @@ static DEFINE_MUTEX(sample_timer_lock);
  */
 static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
 
-struct stack_frame {
+struct stack_frame_user {
 	const void __user	*next_fp;
 	unsigned long		return_address;
 };
 
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+static int
+copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
 {
 	int ret;
 
@@ -125,7 +126,7 @@ trace_kernel(struct pt_regs *regs, struct trace_array *tr,
 static void timer_notify(struct pt_regs *regs, int cpu)
 {
 	struct trace_array_cpu *data;
-	struct stack_frame frame;
+	struct stack_frame_user frame;
 	struct trace_array *tr;
 	const void __user *fp;
 	int is_user;
-- 
cgit v1.2.3


From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 20 May 2010 07:47:21 +0200
Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller

Drop this argument now that we always want to rewind only to the
state of the first caller.
It means frame pointers are not necessary anymore to reliably get
the source of an event. But this also means we need this helper
to be a macro now, as an inline function is not an option since
we need to know when to provide a default implentation.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: David Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/trace/trace_event_perf.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cb6f365016e4..21db1d3a48d0 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,8 +9,6 @@
 #include <linux/kprobes.h>
 #include "trace.h"
 
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
-
 static char *perf_trace_buf[4];
 
 /*
-- 
cgit v1.2.3


From 30dbb20e68e6f7df974b77d2350ebad5eb6f6c9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Am=C3=A9rico=20Wang?= <xiyou.wangcong@gmail.com>
Date: Wed, 26 May 2010 18:57:53 +0800
Subject: tracing: Remove boot tracer

The boot tracer is useless. It simply logs the initcalls
but in fact these initcalls are also logged through printk
while using the initcall_debug kernel parameter.

Nobody seem to be using it so far. Then just remove it.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Chase Douglas <chase.douglas@canonical.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <20100526105753.GA5677@cr0.nay.redhat.com>
[ remove the hooks in main.c, and the headers ]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/Kconfig         |  17 ----
 kernel/trace/Makefile        |   1 -
 kernel/trace/trace.c         |   3 -
 kernel/trace/trace.h         |   8 --
 kernel/trace/trace_boot.c    | 185 -------------------------------------------
 kernel/trace/trace_entries.h |  27 -------
 6 files changed, 241 deletions(-)
 delete mode 100644 kernel/trace/trace_boot.c

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..572992abc71c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -229,23 +229,6 @@ config FTRACE_SYSCALLS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
-config BOOT_TRACER
-	bool "Trace boot initcalls"
-	select GENERIC_TRACER
-	select CONTEXT_SWITCH_TRACER
-	help
-	  This tracer helps developers to optimize boot times: it records
-	  the timings of the initcalls and traces key events and the identity
-	  of tasks that can cause boot delays, such as context-switches.
-
-	  Its aim is to be parsed by the scripts/bootgraph.pl tool to
-	  produce pretty graphics about boot inefficiencies, giving a visual
-	  representation of the delays during initcalls - but the raw
-	  /debug/tracing/trace text output is readable too.
-
-	  You must pass in initcall_debug and ftrace=initcall to the kernel
-	  command line to enable this on bootup.
-
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..c3aaeba82372 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -38,7 +38,6 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
-obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55e48511d7c8..036fbc22858b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4603,9 +4603,6 @@ __init static int tracer_alloc_buffers(void)
 
 	register_tracer(&nop_trace);
 	current_trace = &nop_trace;
-#ifdef CONFIG_BOOT_TRACER
-	register_tracer(&boot_tracer);
-#endif
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..75a5e800a737 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,8 @@
 #include <linux/mmiotrace.h>
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
-#include <trace/boot.h>
 #include <linux/kmemtrace.h>
 #include <linux/hw_breakpoint.h>
-
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
 
@@ -29,8 +27,6 @@ enum trace_type {
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
 	TRACE_BRANCH,
-	TRACE_BOOT_CALL,
-	TRACE_BOOT_RET,
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
@@ -48,8 +44,6 @@ enum kmemtrace_type_id {
 	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
 };
 
-extern struct tracer boot_tracer;
-
 #undef __field
 #define __field(type, item)		type	item;
 
@@ -209,8 +203,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_MMIO_RW);				\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
-		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
-		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
 		IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry,	\
 			  TRACE_GRAPH_ENT);		\
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * ring buffer based initcalls tracer
- *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/kallsyms.h>
-#include <linux/time.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-static struct trace_array *boot_trace;
-static bool pre_initcalls_finished;
-
-/* Tells the boot tracer that the pre_smp_initcalls are finished.
- * So we are ready .
- * It doesn't enable sched events tracing however.
- * You have to call enable_boot_trace to do so.
- */
-void start_boot_trace(void)
-{
-	pre_initcalls_finished = true;
-}
-
-void enable_boot_trace(void)
-{
-	if (boot_trace && pre_initcalls_finished)
-		tracing_start_sched_switch_record();
-}
-
-void disable_boot_trace(void)
-{
-	if (boot_trace && pre_initcalls_finished)
-		tracing_stop_sched_switch_record();
-}
-
-static int boot_trace_init(struct trace_array *tr)
-{
-	boot_trace = tr;
-
-	if (!tr)
-		return 0;
-
-	tracing_reset_online_cpus(tr);
-
-	tracing_sched_switch_assign_trace(tr);
-	return 0;
-}
-
-static enum print_line_t
-initcall_call_print_line(struct trace_iterator *iter)
-{
-	struct trace_entry *entry = iter->ent;
-	struct trace_seq *s = &iter->seq;
-	struct trace_boot_call *field;
-	struct boot_trace_call *call;
-	u64 ts;
-	unsigned long nsec_rem;
-	int ret;
-
-	trace_assign_type(field, entry);
-	call = &field->boot_call;
-	ts = iter->ts;
-	nsec_rem = do_div(ts, NSEC_PER_SEC);
-
-	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
-			(unsigned long)ts, nsec_rem, call->func, call->caller);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	else
-		return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-initcall_ret_print_line(struct trace_iterator *iter)
-{
-	struct trace_entry *entry = iter->ent;
-	struct trace_seq *s = &iter->seq;
-	struct trace_boot_ret *field;
-	struct boot_trace_ret *init_ret;
-	u64 ts;
-	unsigned long nsec_rem;
-	int ret;
-
-	trace_assign_type(field, entry);
-	init_ret = &field->boot_ret;
-	ts = iter->ts;
-	nsec_rem = do_div(ts, NSEC_PER_SEC);
-
-	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
-			"returned %d after %llu msecs\n",
-			(unsigned long) ts,
-			nsec_rem,
-			init_ret->func, init_ret->result, init_ret->duration);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	else
-		return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t initcall_print_line(struct trace_iterator *iter)
-{
-	struct trace_entry *entry = iter->ent;
-
-	switch (entry->type) {
-	case TRACE_BOOT_CALL:
-		return initcall_call_print_line(iter);
-	case TRACE_BOOT_RET:
-		return initcall_ret_print_line(iter);
-	default:
-		return TRACE_TYPE_UNHANDLED;
-	}
-}
-
-struct tracer boot_tracer __read_mostly =
-{
-	.name		= "initcall",
-	.init		= boot_trace_init,
-	.reset		= tracing_reset_online_cpus,
-	.print_line	= initcall_print_line,
-};
-
-void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
-{
-	struct ftrace_event_call *call = &event_boot_call;
-	struct ring_buffer_event *event;
-	struct ring_buffer *buffer;
-	struct trace_boot_call *entry;
-	struct trace_array *tr = boot_trace;
-
-	if (!tr || !pre_initcalls_finished)
-		return;
-
-	/* Get its name now since this function could
-	 * disappear because it is in the .init section.
-	 */
-	sprint_symbol(bt->func, (unsigned long)fn);
-	preempt_disable();
-
-	buffer = tr->buffer;
-	event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->boot_call = *bt;
-	if (!filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
-	preempt_enable();
-}
-
-void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
-{
-	struct ftrace_event_call *call = &event_boot_ret;
-	struct ring_buffer_event *event;
-	struct ring_buffer *buffer;
-	struct trace_boot_ret *entry;
-	struct trace_array *tr = boot_trace;
-
-	if (!tr || !pre_initcalls_finished)
-		return;
-
-	sprint_symbol(bt->func, (unsigned long)fn);
-	preempt_disable();
-
-	buffer = tr->buffer;
-	event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->boot_ret = *bt;
-	if (!filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
-	preempt_enable();
-}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index dc008c1240da..c293364c984f 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -271,33 +271,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
 		 __entry->map_id, __entry->opcode)
 );
 
-FTRACE_ENTRY(boot_call, trace_boot_call,
-
-	TRACE_BOOT_CALL,
-
-	F_STRUCT(
-		__field_struct(	struct boot_trace_call,	boot_call	)
-		__field_desc(	pid_t,	boot_call,	caller		)
-		__array_desc(	char,	boot_call,	func,	KSYM_SYMBOL_LEN)
-	),
-
-	F_printk("%d  %s", __entry->caller, __entry->func)
-);
-
-FTRACE_ENTRY(boot_ret, trace_boot_ret,
-
-	TRACE_BOOT_RET,
-
-	F_STRUCT(
-		__field_struct(	struct boot_trace_ret,	boot_ret	)
-		__array_desc(	char,	boot_ret,	func,	KSYM_SYMBOL_LEN)
-		__field_desc(	int,	boot_ret,	result		)
-		__field_desc(	unsigned long, boot_ret, duration	)
-	),
-
-	F_printk("%s %d %lx",
-		 __entry->func, __entry->result, __entry->duration)
-);
 
 #define TRACE_FUNC_SIZE 30
 #define TRACE_FILE_SIZE 20
-- 
cgit v1.2.3


From c676329abb2b8359d9a5d734dec0c81779823fd6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 25 May 2010 10:48:51 +0200
Subject: sched_clock: Add local_clock() API and improve documentation

For people who otherwise get to write: cpu_clock(smp_processor_id()),
there is now: local_clock().

Also, as per suggestion from Andrew, provide some documentation on
the various clock interfaces, and minimize the unsigned long long vs
u64 mess.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jens Axboe <jaxboe@fusionio.com>
LKML-Reference: <1275052414.1645.52.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..1723e2b8c589 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -56,7 +56,7 @@ u64 notrace trace_clock_local(void)
  */
 u64 notrace trace_clock(void)
 {
-	return cpu_clock(raw_smp_processor_id());
+	return local_clock();
 }
 
 
-- 
cgit v1.2.3


From 039ca4e74a1cf60bd7487324a564ecf5c981f254 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 26 May 2010 17:22:17 +0800
Subject: tracing: Remove kmemtrace ftrace plugin

We have been resisting new ftrace plugins and removing existing
ones, and kmemtrace has been superseded by kmem trace events
and perf-kmem, so we remove it.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
[ remove kmemtrace from the makefile, handle slob too ]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/Kconfig         |  20 --
 kernel/trace/Makefile        |   1 -
 kernel/trace/kmemtrace.c     | 529 -------------------------------------------
 kernel/trace/trace.h         |  12 -
 kernel/trace/trace_entries.h |  35 ---
 5 files changed, 597 deletions(-)
 delete mode 100644 kernel/trace/kmemtrace.c

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 572992abc71c..f669092fdead 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -354,26 +354,6 @@ config STACK_TRACER
 
 	  Say N if unsure.
 
-config KMEMTRACE
-	bool "Trace SLAB allocations"
-	select GENERIC_TRACER
-	help
-	  kmemtrace provides tracing for slab allocator functions, such as
-	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
-	  data is then fed to the userspace application in order to analyse
-	  allocation hotspots, internal fragmentation and so on, making it
-	  possible to see how well an allocator performs, as well as debug
-	  and profile kernel code.
-
-	  This requires an userspace application to use. See
-	  Documentation/trace/kmemtrace.txt for more information.
-
-	  Saying Y will make the kernel somewhat larger and slower. However,
-	  if you disable kmemtrace at run-time or boot-time, the performance
-	  impact is minimal (depending on the arch the kernel is built for).
-
-	  If unsure, say N.
-
 config WORKQUEUE_TRACER
 	bool "Trace workqueues"
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c3aaeba82372..469a1c7555a5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -40,7 +40,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
 ifeq ($(CONFIG_BLOCK),y)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index bbfc1bb1660b..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,529 +0,0 @@
-/*
- * Memory allocator tracing
- *
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- */
-
-#include <linux/tracepoint.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-#include <linux/kmemtrace.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-/* Select an alternative, minimalistic output than the original one */
-#define TRACE_KMEM_OPT_MINIMAL	0x1
-
-static struct tracer_opt kmem_opts[] = {
-	/* Default disable the minimalistic output */
-	{ TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
-	{ }
-};
-
-static struct tracer_flags kmem_tracer_flags = {
-	.val			= 0,
-	.opts			= kmem_opts
-};
-
-static struct trace_array *kmemtrace_array;
-
-/* Trace allocations */
-static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
-				   unsigned long call_site,
-				   const void *ptr,
-				   size_t bytes_req,
-				   size_t bytes_alloc,
-				   gfp_t gfp_flags,
-				   int node)
-{
-	struct ftrace_event_call *call = &event_kmem_alloc;
-	struct trace_array *tr = kmemtrace_array;
-	struct kmemtrace_alloc_entry *entry;
-	struct ring_buffer_event *event;
-
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
-	if (!event)
-		return;
-
-	entry = ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-
-	entry->ent.type		= TRACE_KMEM_ALLOC;
-	entry->type_id		= type_id;
-	entry->call_site	= call_site;
-	entry->ptr		= ptr;
-	entry->bytes_req	= bytes_req;
-	entry->bytes_alloc	= bytes_alloc;
-	entry->gfp_flags	= gfp_flags;
-	entry->node		= node;
-
-	if (!filter_check_discard(call, entry, tr->buffer, event))
-		ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-}
-
-static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
-				  unsigned long call_site,
-				  const void *ptr)
-{
-	struct ftrace_event_call *call = &event_kmem_free;
-	struct trace_array *tr = kmemtrace_array;
-	struct kmemtrace_free_entry *entry;
-	struct ring_buffer_event *event;
-
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-
-	entry->ent.type		= TRACE_KMEM_FREE;
-	entry->type_id		= type_id;
-	entry->call_site	= call_site;
-	entry->ptr		= ptr;
-
-	if (!filter_check_discard(call, entry, tr->buffer, event))
-		ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-}
-
-static void kmemtrace_kmalloc(void *ignore,
-			      unsigned long call_site,
-			      const void *ptr,
-			      size_t bytes_req,
-			      size_t bytes_alloc,
-			      gfp_t gfp_flags)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmem_cache_alloc(void *ignore,
-				       unsigned long call_site,
-				       const void *ptr,
-				       size_t bytes_req,
-				       size_t bytes_alloc,
-				       gfp_t gfp_flags)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmalloc_node(void *ignore,
-				   unsigned long call_site,
-				   const void *ptr,
-				   size_t bytes_req,
-				   size_t bytes_alloc,
-				   gfp_t gfp_flags,
-				   int node)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kmem_cache_alloc_node(void *ignore,
-					    unsigned long call_site,
-					    const void *ptr,
-					    size_t bytes_req,
-					    size_t bytes_alloc,
-					    gfp_t gfp_flags,
-					    int node)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void
-kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
-{
-	kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
-}
-
-static void kmemtrace_kmem_cache_free(void *ignore,
-				      unsigned long call_site, const void *ptr)
-{
-	kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
-}
-
-static int kmemtrace_start_probes(void)
-{
-	int err;
-
-	err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
-	if (err)
-		return err;
-	err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
-	if (err)
-		return err;
-	err = register_trace_kfree(kmemtrace_kfree, NULL);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
-
-	return err;
-}
-
-static void kmemtrace_stop_probes(void)
-{
-	unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
-	unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
-	unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
-	unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
-	unregister_trace_kfree(kmemtrace_kfree, NULL);
-	unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
-}
-
-static int kmem_trace_init(struct trace_array *tr)
-{
-	kmemtrace_array = tr;
-
-	tracing_reset_online_cpus(tr);
-
-	kmemtrace_start_probes();
-
-	return 0;
-}
-
-static void kmem_trace_reset(struct trace_array *tr)
-{
-	kmemtrace_stop_probes();
-}
-
-static void kmemtrace_headers(struct seq_file *s)
-{
-	/* Don't need headers for the original kmemtrace output */
-	if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
-		return;
-
-	seq_printf(s, "#\n");
-	seq_printf(s, "# ALLOC  TYPE  REQ   GIVEN  FLAGS     "
-			"      POINTER         NODE    CALLER\n");
-	seq_printf(s, "# FREE   |      |     |       |       "
-			"       |   |            |        |\n");
-	seq_printf(s, "# |\n\n");
-}
-
-/*
- * The following functions give the original output from kmemtrace,
- * plus the origin CPU, since reordering occurs in-kernel now.
- */
-
-#define KMEMTRACE_USER_ALLOC	0
-#define KMEMTRACE_USER_FREE	1
-
-struct kmemtrace_user_event {
-	u8			event_id;
-	u8			type_id;
-	u16			event_size;
-	u32			cpu;
-	u64			timestamp;
-	unsigned long		call_site;
-	unsigned long		ptr;
-};
-
-struct kmemtrace_user_event_alloc {
-	size_t			bytes_req;
-	size_t			bytes_alloc;
-	unsigned		gfp_flags;
-	int			node;
-};
-
-static enum print_line_t
-kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
-		      struct trace_event *event)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_alloc_entry *entry;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
-	    "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
-	    entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
-	    (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
-	    (unsigned long)entry->gfp_flags, entry->node);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free(struct trace_iterator *iter, int flags,
-		     struct trace_event *event)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_free_entry *entry;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
-			       entry->type_id, (void *)entry->call_site,
-			       (unsigned long)entry->ptr);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
-			   struct trace_event *event)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_alloc_entry *entry;
-	struct kmemtrace_user_event *ev;
-	struct kmemtrace_user_event_alloc *ev_alloc;
-
-	trace_assign_type(entry, iter->ent);
-
-	ev = trace_seq_reserve(s, sizeof(*ev));
-	if (!ev)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev->event_id		= KMEMTRACE_USER_ALLOC;
-	ev->type_id		= entry->type_id;
-	ev->event_size		= sizeof(*ev) + sizeof(*ev_alloc);
-	ev->cpu			= iter->cpu;
-	ev->timestamp		= iter->ts;
-	ev->call_site		= entry->call_site;
-	ev->ptr			= (unsigned long)entry->ptr;
-
-	ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
-	if (!ev_alloc)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev_alloc->bytes_req	= entry->bytes_req;
-	ev_alloc->bytes_alloc	= entry->bytes_alloc;
-	ev_alloc->gfp_flags	= entry->gfp_flags;
-	ev_alloc->node		= entry->node;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
-			  struct trace_event *event)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_free_entry *entry;
-	struct kmemtrace_user_event *ev;
-
-	trace_assign_type(entry, iter->ent);
-
-	ev = trace_seq_reserve(s, sizeof(*ev));
-	if (!ev)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev->event_id		= KMEMTRACE_USER_FREE;
-	ev->type_id		= entry->type_id;
-	ev->event_size		= sizeof(*ev);
-	ev->cpu			= iter->cpu;
-	ev->timestamp		= iter->ts;
-	ev->call_site		= entry->call_site;
-	ev->ptr			= (unsigned long)entry->ptr;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-/* The two other following provide a more minimalistic output */
-static enum print_line_t
-kmemtrace_print_alloc_compress(struct trace_iterator *iter)
-{
-	struct kmemtrace_alloc_entry *entry;
-	struct trace_seq *s = &iter->seq;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	/* Alloc entry */
-	ret = trace_seq_printf(s, "  +      ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Type */
-	switch (entry->type_id) {
-	case KMEMTRACE_TYPE_KMALLOC:
-		ret = trace_seq_printf(s, "K   ");
-		break;
-	case KMEMTRACE_TYPE_CACHE:
-		ret = trace_seq_printf(s, "C   ");
-		break;
-	case KMEMTRACE_TYPE_PAGES:
-		ret = trace_seq_printf(s, "P   ");
-		break;
-	default:
-		ret = trace_seq_printf(s, "?   ");
-	}
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Requested */
-	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_req);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Allocated */
-	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_alloc);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Flags
-	 * TODO: would be better to see the name of the GFP flag names
-	 */
-	ret = trace_seq_printf(s, "%08x   ", entry->gfp_flags);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Pointer to allocated */
-	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Node and call site*/
-	ret = trace_seq_printf(s, "%4d   %pf\n", entry->node,
-						 (void *)entry->call_site);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_compress(struct trace_iterator *iter)
-{
-	struct kmemtrace_free_entry *entry;
-	struct trace_seq *s = &iter->seq;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	/* Free entry */
-	ret = trace_seq_printf(s, "  -      ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Type */
-	switch (entry->type_id) {
-	case KMEMTRACE_TYPE_KMALLOC:
-		ret = trace_seq_printf(s, "K     ");
-		break;
-	case KMEMTRACE_TYPE_CACHE:
-		ret = trace_seq_printf(s, "C     ");
-		break;
-	case KMEMTRACE_TYPE_PAGES:
-		ret = trace_seq_printf(s, "P     ");
-		break;
-	default:
-		ret = trace_seq_printf(s, "?     ");
-	}
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Skip requested/allocated/flags */
-	ret = trace_seq_printf(s, "                       ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Pointer to allocated */
-	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Skip node and print call site*/
-	ret = trace_seq_printf(s, "       %pf\n", (void *)entry->call_site);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
-{
-	struct trace_entry *entry = iter->ent;
-
-	if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
-		return TRACE_TYPE_UNHANDLED;
-
-	switch (entry->type) {
-	case TRACE_KMEM_ALLOC:
-		return kmemtrace_print_alloc_compress(iter);
-	case TRACE_KMEM_FREE:
-		return kmemtrace_print_free_compress(iter);
-	default:
-		return TRACE_TYPE_UNHANDLED;
-	}
-}
-
-static struct trace_event_functions kmem_trace_alloc_funcs = {
-	.trace			= kmemtrace_print_alloc,
-	.binary			= kmemtrace_print_alloc_user,
-};
-
-static struct trace_event kmem_trace_alloc = {
-	.type			= TRACE_KMEM_ALLOC,
-	.funcs			= &kmem_trace_alloc_funcs,
-};
-
-static struct trace_event_functions kmem_trace_free_funcs = {
-	.trace			= kmemtrace_print_free,
-	.binary			= kmemtrace_print_free_user,
-};
-
-static struct trace_event kmem_trace_free = {
-	.type			= TRACE_KMEM_FREE,
-	.funcs			= &kmem_trace_free_funcs,
-};
-
-static struct tracer kmem_tracer __read_mostly = {
-	.name			= "kmemtrace",
-	.init			= kmem_trace_init,
-	.reset			= kmem_trace_reset,
-	.print_line		= kmemtrace_print_line,
-	.print_header		= kmemtrace_headers,
-	.flags			= &kmem_tracer_flags
-};
-
-void kmemtrace_init(void)
-{
-	/* earliest opportunity to start kmem tracing */
-}
-
-static int __init init_kmem_tracer(void)
-{
-	if (!register_ftrace_event(&kmem_trace_alloc)) {
-		pr_warning("Warning: could not register kmem events\n");
-		return 1;
-	}
-
-	if (!register_ftrace_event(&kmem_trace_free)) {
-		pr_warning("Warning: could not register kmem events\n");
-		return 1;
-	}
-
-	if (register_tracer(&kmem_tracer) != 0) {
-		pr_warning("Warning: could not register the kmem tracer\n");
-		return 1;
-	}
-
-	return 0;
-}
-device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 75a5e800a737..075cd2ea84a2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,7 +9,6 @@
 #include <linux/mmiotrace.h>
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
-#include <linux/kmemtrace.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -30,19 +29,12 @@ enum trace_type {
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
-	TRACE_KMEM_ALLOC,
-	TRACE_KMEM_FREE,
 	TRACE_BLK,
 	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
 
-enum kmemtrace_type_id {
-	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
-	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
-	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
-};
 
 #undef __field
 #define __field(type, item)		type	item;
@@ -208,10 +200,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
-		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
-			  TRACE_KMEM_ALLOC);	\
-		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
-			  TRACE_KMEM_FREE);	\
 		IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
 		__ftrace_bad_type();					\
 	} while (0)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c293364c984f..13abc157dbaf 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -291,41 +291,6 @@ FTRACE_ENTRY(branch, trace_branch,
 		 __entry->func, __entry->file, __entry->correct)
 );
 
-FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
-
-	TRACE_KMEM_ALLOC,
-
-	F_STRUCT(
-		__field(	enum kmemtrace_type_id,	type_id		)
-		__field(	unsigned long,		call_site	)
-		__field(	const void *,		ptr		)
-		__field(	size_t,			bytes_req	)
-		__field(	size_t,			bytes_alloc	)
-		__field(	gfp_t,			gfp_flags	)
-		__field(	int,			node		)
-	),
-
-	F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
-		 " flags:%x node:%d",
-		 __entry->type_id, __entry->call_site, __entry->ptr,
-		 __entry->bytes_req, __entry->bytes_alloc,
-		 __entry->gfp_flags, __entry->node)
-);
-
-FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
-
-	TRACE_KMEM_FREE,
-
-	F_STRUCT(
-		__field(	enum kmemtrace_type_id,	type_id		)
-		__field(	unsigned long,		call_site	)
-		__field(	const void *,		ptr		)
-	),
-
-	F_printk("type:%u call_site:%lx ptr:%p",
-		 __entry->type_id, __entry->call_site, __entry->ptr)
-);
-
 FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
 
 	TRACE_KSYM,
-- 
cgit v1.2.3


From c9642c49aae1272d7c24008a40ae614470b957a6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 24 May 2010 16:22:30 +0800
Subject: tracing: Use a global field list for all syscall exit events

All syscall exit events have the same fields.

The kernel size drops 2.5K:

   text    data     bss     dec     hex filename
7018612 2034376 7251132 16304120         f8c7f8 vmlinux.o.orig
7018612 2031888 7251132 16301632         f8be40 vmlinux.o

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BFA3746.8070100@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 34e35804304b..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 static int syscall_enter_define_fields(struct ftrace_event_call *call);
 static int syscall_exit_define_fields(struct ftrace_event_call *call);
 
+/* All syscall exit events have the same fields */
+static LIST_HEAD(syscall_exit_fields);
+
 static struct list_head *
 syscall_get_enter_fields(struct ftrace_event_call *call)
 {
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
 static struct list_head *
 syscall_get_exit_fields(struct ftrace_event_call *call)
 {
-	struct syscall_metadata *entry = call->data;
-
-	return &entry->exit_fields;
+	return &syscall_exit_fields;
 }
 
 struct trace_event_functions enter_syscall_print_funcs = {
-- 
cgit v1.2.3


From 8728fe501ed562c1b46dde3c195eadec77bca033 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 24 May 2010 16:22:49 +0800
Subject: tracing: Don't allocate common fields for every trace events

Every event has the same common fields, so it's a big waste of
memory to have a copy of those fields for every event.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BFA3759.30105@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h               |   2 +
 kernel/trace/trace_events.c        | 113 +++++++++++++++++++++----------------
 kernel/trace/trace_events_filter.c |  18 +++++-
 3 files changed, 81 insertions(+), 52 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 01ce088c1cdf..cc90ccdad469 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -698,6 +698,8 @@ struct filter_pred {
 	int 			pop_n;
 };
 
+extern struct list_head ftrace_common_fields;
+
 extern enum regex_type
 filter_parse_regex(char *buff, int len, char **search, int *not);
 extern void print_event_filter(struct ftrace_event_call *call,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a594f9a7ee3d..d3b4bdf00b39 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,6 +28,7 @@
 DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
+LIST_HEAD(ftrace_common_fields);
 
 struct list_head *
 trace_get_fields(struct ftrace_event_call *event_call)
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call)
 	return event_call->class->get_fields(event_call);
 }
 
-int trace_define_field(struct ftrace_event_call *call, const char *type,
-		       const char *name, int offset, int size, int is_signed,
-		       int filter_type)
+static int __trace_define_field(struct list_head *head, const char *type,
+				const char *name, int offset, int size,
+				int is_signed, int filter_type)
 {
 	struct ftrace_event_field *field;
-	struct list_head *head;
-
-	if (WARN_ON(!call->class))
-		return 0;
 
 	field = kzalloc(sizeof(*field), GFP_KERNEL);
 	if (!field)
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
 	field->size = size;
 	field->is_signed = is_signed;
 
-	head = trace_get_fields(call);
 	list_add(&field->link, head);
 
 	return 0;
@@ -80,17 +76,32 @@ err:
 
 	return -ENOMEM;
 }
+
+int trace_define_field(struct ftrace_event_call *call, const char *type,
+		       const char *name, int offset, int size, int is_signed,
+		       int filter_type)
+{
+	struct list_head *head;
+
+	if (WARN_ON(!call->class))
+		return 0;
+
+	head = trace_get_fields(call);
+	return __trace_define_field(head, type, name, offset, size,
+				    is_signed, filter_type);
+}
 EXPORT_SYMBOL_GPL(trace_define_field);
 
 #define __common_field(type, item)					\
-	ret = trace_define_field(call, #type, "common_" #item,		\
-				 offsetof(typeof(ent), item),		\
-				 sizeof(ent.item),			\
-				 is_signed_type(type), FILTER_OTHER);	\
+	ret = __trace_define_field(&ftrace_common_fields, #type,	\
+				   "common_" #item,			\
+				   offsetof(typeof(ent), item),		\
+				   sizeof(ent.item),			\
+				   is_signed_type(type), FILTER_OTHER);	\
 	if (ret)							\
 		return ret;
 
-static int trace_define_common_fields(struct ftrace_event_call *call)
+static int trace_define_common_fields(void)
 {
 	int ret;
 	struct trace_entry ent;
@@ -544,32 +555,10 @@ out:
 	return ret;
 }
 
-static ssize_t
-event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
-		  loff_t *ppos)
+static void print_event_fields(struct trace_seq *s, struct list_head *head)
 {
-	struct ftrace_event_call *call = filp->private_data;
 	struct ftrace_event_field *field;
-	struct list_head *head;
-	struct trace_seq *s;
-	int common_field_count = 5;
-	char *buf;
-	int r = 0;
-
-	if (*ppos)
-		return 0;
-
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
-
-	trace_seq_init(s);
 
-	trace_seq_printf(s, "name: %s\n", call->name);
-	trace_seq_printf(s, "ID: %d\n", call->event.type);
-	trace_seq_printf(s, "format:\n");
-
-	head = trace_get_fields(call);
 	list_for_each_entry_reverse(field, head, link) {
 		/*
 		 * Smartly shows the array type(except dynamic array).
@@ -584,29 +573,54 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 			array_descriptor = NULL;
 
 		if (!array_descriptor) {
-			r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
+			trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
 					"\tsize:%u;\tsigned:%d;\n",
 					field->type, field->name, field->offset,
 					field->size, !!field->is_signed);
 		} else {
-			r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
+			trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
 					"\tsize:%u;\tsigned:%d;\n",
 					(int)(array_descriptor - field->type),
 					field->type, field->name,
 					array_descriptor, field->offset,
 					field->size, !!field->is_signed);
 		}
+	}
+}
 
-		if (--common_field_count == 0)
-			r = trace_seq_printf(s, "\n");
+static ssize_t
+event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
+		  loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	struct list_head *head;
+	struct trace_seq *s;
+	char *buf;
+	int r;
 
-		if (!r)
-			break;
-	}
+	if (*ppos)
+		return 0;
 
-	if (r)
-		r = trace_seq_printf(s, "\nprint fmt: %s\n",
-				call->print_fmt);
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	trace_seq_printf(s, "name: %s\n", call->name);
+	trace_seq_printf(s, "ID: %d\n", call->event.type);
+	trace_seq_printf(s, "format:\n");
+
+	/* print common fields */
+	print_event_fields(s, &ftrace_common_fields);
+
+	trace_seq_putc(s, '\n');
+
+	/* print event specific fields */
+	head = trace_get_fields(call);
+	print_event_fields(s, head);
+
+	r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt);
 
 	if (!r) {
 		/*
@@ -980,9 +994,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 		 */
 		head = trace_get_fields(call);
 		if (list_empty(head)) {
-			ret = trace_define_common_fields(call);
-			if (!ret)
-				ret = call->class->define_fields(call);
+			ret = call->class->define_fields(call);
 			if (ret < 0) {
 				pr_warning("Could not initialize trace point"
 					   " events/%s\n", call->name);
@@ -1319,6 +1331,9 @@ static __init int event_trace_init(void)
 	trace_create_file("enable", 0644, d_events,
 			  NULL, &ftrace_system_enable_fops);
 
+	if (trace_define_common_fields())
+		pr_warning("tracing: Failed to allocate common fields");
+
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 57bb1bb32999..330fefd1de1c 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system,
 }
 
 static struct ftrace_event_field *
-find_event_field(struct ftrace_event_call *call, char *name)
+__find_event_field(struct list_head *head, char *name)
 {
 	struct ftrace_event_field *field;
-	struct list_head *head;
 
-	head = trace_get_fields(call);
 	list_for_each_entry(field, head, link) {
 		if (!strcmp(field->name, name))
 			return field;
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
 	return NULL;
 }
 
+static struct ftrace_event_field *
+find_event_field(struct ftrace_event_call *call, char *name)
+{
+	struct ftrace_event_field *field;
+	struct list_head *head;
+
+	field = __find_event_field(&ftrace_common_fields, name);
+	if (field)
+		return field;
+
+	head = trace_get_fields(call);
+	return __find_event_field(head, name);
+}
+
 static void filter_free_pred(struct filter_pred *pred)
 {
 	if (!pred)
-- 
cgit v1.2.3


From c9d932cf8a1c608b676021aef0189376ba6ef151 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 24 May 2010 16:24:28 +0800
Subject: tracing: Remove test of NULL define_fields callback

Every event (or event class) has it's define_fields callback,
so the test is redundant.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BFA37BC.8080707@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c        | 28 +++++++++++++---------------
 kernel/trace/trace_events_filter.c |  9 ---------
 2 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d3b4bdf00b39..5bad9cbbf974 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -987,23 +987,21 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 		 		  id);
 #endif
 
-	if (call->class->define_fields) {
-		/*
-		 * Other events may have the same class. Only update
-		 * the fields if they are not already defined.
-		 */
-		head = trace_get_fields(call);
-		if (list_empty(head)) {
-			ret = call->class->define_fields(call);
-			if (ret < 0) {
-				pr_warning("Could not initialize trace point"
-					   " events/%s\n", call->name);
-				return ret;
-			}
+	/*
+	 * Other events may have the same class. Only update
+	 * the fields if they are not already defined.
+	 */
+	head = trace_get_fields(call);
+	if (list_empty(head)) {
+		ret = call->class->define_fields(call);
+		if (ret < 0) {
+			pr_warning("Could not initialize trace point"
+				   " events/%s\n", call->name);
+			return ret;
 		}
-		trace_create_file("filter", 0644, call->dir, call,
-				  filter);
 	}
+	trace_create_file("filter", 0644, call->dir, call,
+			  filter);
 
 	trace_create_file("format", 0444, call->dir, call,
 			  format);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 330fefd1de1c..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -639,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system)
 	int err;
 
 	list_for_each_entry(call, &ftrace_events, list) {
-		if (!call->class || !call->class->define_fields)
-			continue;
-
 		if (strcmp(call->class->system, system->name) != 0)
 			continue;
 
@@ -658,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 	struct ftrace_event_call *call;
 
 	list_for_each_entry(call, &ftrace_events, list) {
-		if (!call->class || !call->class->define_fields)
-			continue;
-
 		if (strcmp(call->class->system, system->name) != 0)
 			continue;
 
@@ -1263,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system,
 	list_for_each_entry(call, &ftrace_events, list) {
 		struct event_filter *filter = call->filter;
 
-		if (!call->class || !call->class->define_fields)
-			continue;
-
 		if (strcmp(call->class->system, system->name) != 0)
 			continue;
 
-- 
cgit v1.2.3


From ffb9f99528574ab9a55d4c8fd22e9d3ca49efa86 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 24 May 2010 16:24:52 +0800
Subject: tracing: Remove redundant raw_init callbacks

raw_init callback is optional.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BFA37D4.7070500@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_export.c |  8 +-------
 kernel/trace/trace_kprobe.c | 10 +---------
 2 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 8536e2a65969..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 
 #include "trace_entries.h"
 
-static int ftrace_raw_init_event(struct ftrace_event_call *call)
-{
-	INIT_LIST_HEAD(&call->class->fields);
-	return 0;
-}
-
 #undef __entry
 #define __entry REC
 
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
 struct ftrace_event_class event_class_ftrace_##call = {			\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.define_fields		= ftrace_define_fields_##call,		\
-	.raw_init		= ftrace_raw_init_event,		\
+	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
 };									\
 									\
 struct ftrace_event_call __used						\
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f52b5f50299d..3b831d8e201e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1214,11 +1214,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
 	}
 }
 
-static int probe_event_raw_init(struct ftrace_event_call *event_call)
-{
-	return 0;
-}
-
 #undef DEFINE_FIELD
 #define DEFINE_FIELD(type, item, name, is_signed)			\
 	do {								\
@@ -1486,15 +1481,12 @@ static int register_probe_event(struct trace_probe *tp)
 	int ret;
 
 	/* Initialize ftrace_event_call */
+	INIT_LIST_HEAD(&call->class->fields);
 	if (probe_is_return(tp)) {
-		INIT_LIST_HEAD(&call->class->fields);
 		call->event.funcs = &kretprobe_funcs;
-		call->class->raw_init = probe_event_raw_init;
 		call->class->define_fields = kretprobe_event_define_fields;
 	} else {
-		INIT_LIST_HEAD(&call->class->fields);
 		call->event.funcs = &kprobe_funcs;
-		call->class->raw_init = probe_event_raw_init;
 		call->class->define_fields = kprobe_event_define_fields;
 	}
 	if (set_print_fmt(tp) < 0)
-- 
cgit v1.2.3


From 67ead0a6ceb001b4cb891d782e440f0e79493ba2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 24 May 2010 16:25:13 +0800
Subject: tracing: Remove open-coded __trace_add_event_call()

Let trace_module_add_events() and event_trace_init() call
__trace_add_event_call().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BFA37E9.1020106@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 70 ++++++++++++---------------------------------
 1 file changed, 19 insertions(+), 51 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5bad9cbbf974..69bee4cc0e10 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1009,11 +1009,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 	return 0;
 }
 
-static int __trace_add_event_call(struct ftrace_event_call *call)
+static int
+__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
+		       const struct file_operations *id,
+		       const struct file_operations *enable,
+		       const struct file_operations *filter,
+		       const struct file_operations *format)
 {
 	struct dentry *d_events;
 	int ret;
 
+	/* The linker may leave blanks */
 	if (!call->name)
 		return -EINVAL;
 
@@ -1021,8 +1027,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
 		ret = call->class->raw_init(call);
 		if (ret < 0) {
 			if (ret != -ENOSYS)
-				pr_warning("Could not initialize trace "
-				"events/%s\n", call->name);
+				pr_warning("Could not initialize trace events/%s\n",
+					   call->name);
 			return ret;
 		}
 	}
@@ -1031,11 +1037,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
 	if (!d_events)
 		return -ENOENT;
 
-	ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
-				&ftrace_enable_fops, &ftrace_event_filter_fops,
-				&ftrace_event_format_fops);
+	ret = event_create_dir(call, d_events, id, enable, filter, format);
 	if (!ret)
 		list_add(&call->list, &ftrace_events);
+	call->mod = mod;
 
 	return ret;
 }
@@ -1045,7 +1050,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
 {
 	int ret;
 	mutex_lock(&event_mutex);
-	ret = __trace_add_event_call(call);
+	ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
+				     &ftrace_enable_fops,
+				     &ftrace_event_filter_fops,
+				     &ftrace_event_format_fops);
 	mutex_unlock(&event_mutex);
 	return ret;
 }
@@ -1162,8 +1170,6 @@ static void trace_module_add_events(struct module *mod)
 {
 	struct ftrace_module_file_ops *file_ops = NULL;
 	struct ftrace_event_call *call, *start, *end;
-	struct dentry *d_events;
-	int ret;
 
 	start = mod->trace_events;
 	end = mod->trace_events + mod->num_trace_events;
@@ -1171,38 +1177,14 @@ static void trace_module_add_events(struct module *mod)
 	if (start == end)
 		return;
 
-	d_events = event_trace_events_dir();
-	if (!d_events)
+	file_ops = trace_create_file_ops(mod);
+	if (!file_ops)
 		return;
 
 	for_each_event(call, start, end) {
-		/* The linker may leave blanks */
-		if (!call->name)
-			continue;
-		if (call->class->raw_init) {
-			ret = call->class->raw_init(call);
-			if (ret < 0) {
-				if (ret != -ENOSYS)
-					pr_warning("Could not initialize trace "
-					"point events/%s\n", call->name);
-				continue;
-			}
-		}
-		/*
-		 * This module has events, create file ops for this module
-		 * if not already done.
-		 */
-		if (!file_ops) {
-			file_ops = trace_create_file_ops(mod);
-			if (!file_ops)
-				return;
-		}
-		call->mod = mod;
-		ret = event_create_dir(call, d_events,
+		__trace_add_event_call(call, mod,
 				       &file_ops->id, &file_ops->enable,
 				       &file_ops->filter, &file_ops->format);
-		if (!ret)
-			list_add(&call->list, &ftrace_events);
 	}
 }
 
@@ -1333,24 +1315,10 @@ static __init int event_trace_init(void)
 		pr_warning("tracing: Failed to allocate common fields");
 
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
-		/* The linker may leave blanks */
-		if (!call->name)
-			continue;
-		if (call->class->raw_init) {
-			ret = call->class->raw_init(call);
-			if (ret < 0) {
-				if (ret != -ENOSYS)
-					pr_warning("Could not initialize trace "
-					"point events/%s\n", call->name);
-				continue;
-			}
-		}
-		ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+		__trace_add_event_call(call, NULL, &ftrace_event_id_fops,
 				       &ftrace_enable_fops,
 				       &ftrace_event_filter_fops,
 				       &ftrace_event_format_fops);
-		if (!ret)
-			list_add(&call->list, &ftrace_events);
 	}
 
 	while (true) {
-- 
cgit v1.2.3


From d62f85d1e22e537192ce494c89540e1ac0d8bfc7 Mon Sep 17 00:00:00 2001
From: Chase Douglas <chase.douglas@canonical.com>
Date: Tue, 15 Jun 2010 12:29:15 -0400
Subject: tracing/function-graph: Use correct string size for snprintf

The nsecs_str string is a local variable defined as:

char nsecs_str[5];

It is possible for the snprintf call to use a size value larger than the
size of the string. This should not cause a buffer overrun as it is
written now due to the value for the string format "%03lu" can not be
larger than 1000. However, this change makes it correct. By making the
size correct we guard against potential future changes that could actually
cause a buffer overrun.

Signed-off-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1276619355-18116-1-git-send-email-chase.douglas@canonical.com>

[ added 'UL' to number 8 to fix gcc warning comparing it to sizeof() ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..6bff23625781 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -641,7 +641,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 
 	/* Print nsecs (we don't want to exceed 7 numbers) */
 	if (len < 7) {
-		snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
+		snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
+			 nsecs_rem);
 		ret = trace_seq_printf(s, ".%s", nsecs_str);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-- 
cgit v1.2.3


From a1d0ce8213e9ddf4046ef5ba95c55762d075f541 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 8 Jun 2010 11:22:06 -0400
Subject: tracing: Use class->reg() for all registering of events

Because kprobes and syscalls need special processing to register
events, the class->reg() method was created to handle the differences.

But instead of creating a default ->reg for perf and ftrace events,
the code was scattered with:

	if (class->reg)
		class->reg();
	else
		default_reg();

This is messy and can also lead to bugs.

This patch cleans up this code and creates a default reg() entry for
the events allowing for the code to directly call the class->reg()
without the condition.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_event_perf.c | 19 +++-----------
 kernel/trace/trace_events.c     | 55 +++++++++++++++++++++++++++--------------
 2 files changed, 39 insertions(+), 35 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 6053982dc302..23751659582e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -54,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 		}
 	}
 
-	if (tp_event->class->reg)
-		ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
-	else
-		ret = tracepoint_probe_register(tp_event->name,
-						tp_event->class->perf_probe,
-						tp_event);
-
+	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
 	if (ret)
 		goto fail;
 
@@ -94,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event)
 	mutex_lock(&event_mutex);
 	list_for_each_entry(tp_event, &ftrace_events, list) {
 		if (tp_event->event.type == event_id &&
-		    tp_event->class &&
-		    (tp_event->class->perf_probe ||
-		     tp_event->class->reg) &&
+		    tp_event->class && tp_event->class->reg &&
 		    try_module_get(tp_event->mod)) {
 			ret = perf_trace_event_init(tp_event, p_event);
 			break;
@@ -136,12 +128,7 @@ void perf_trace_destroy(struct perf_event *p_event)
 	if (--tp_event->perf_refcount > 0)
 		goto out;
 
-	if (tp_event->class->reg)
-		tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
-	else
-		tracepoint_probe_unregister(tp_event->name,
-					    tp_event->class->perf_probe,
-					    tp_event);
+	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
 
 	/*
 	 * Ensure our callback won't be called anymore. See
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 69bee4cc0e10..e8e6043f4d29 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -141,6 +141,35 @@ int trace_event_raw_init(struct ftrace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(trace_event_raw_init);
 
+int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
+{
+	switch (type) {
+	case TRACE_REG_REGISTER:
+		return tracepoint_probe_register(call->name,
+						 call->class->probe,
+						 call);
+	case TRACE_REG_UNREGISTER:
+		tracepoint_probe_unregister(call->name,
+					    call->class->probe,
+					    call);
+		return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+	case TRACE_REG_PERF_REGISTER:
+		return tracepoint_probe_register(call->name,
+						 call->class->perf_probe,
+						 call);
+	case TRACE_REG_PERF_UNREGISTER:
+		tracepoint_probe_unregister(call->name,
+					    call->class->perf_probe,
+					    call);
+		return 0;
+#endif
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_event_reg);
+
 static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 {
@@ -151,23 +180,13 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 		if (call->flags & TRACE_EVENT_FL_ENABLED) {
 			call->flags &= ~TRACE_EVENT_FL_ENABLED;
 			tracing_stop_cmdline_record();
-			if (call->class->reg)
-				call->class->reg(call, TRACE_REG_UNREGISTER);
-			else
-				tracepoint_probe_unregister(call->name,
-							    call->class->probe,
-							    call);
+			call->class->reg(call, TRACE_REG_UNREGISTER);
 		}
 		break;
 	case 1:
 		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
 			tracing_start_cmdline_record();
-			if (call->class->reg)
-				ret = call->class->reg(call, TRACE_REG_REGISTER);
-			else
-				ret = tracepoint_probe_register(call->name,
-								call->class->probe,
-								call);
+			ret = call->class->reg(call, TRACE_REG_REGISTER);
 			if (ret) {
 				tracing_stop_cmdline_record();
 				pr_info("event trace: Could not enable event "
@@ -205,8 +224,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 
-		if (!call->name || !call->class ||
-		    (!call->class->probe && !call->class->reg))
+		if (!call->name || !call->class || !call->class->reg)
 			continue;
 
 		if (match &&
@@ -332,7 +350,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		 * The ftrace subsystem is for showing formats only.
 		 * They can not be enabled or disabled via the event files.
 		 */
-		if (call->class && (call->class->probe || call->class->reg))
+		if (call->class && call->class->reg)
 			return call;
 	}
 
@@ -485,8 +503,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
-		if (!call->name || !call->class ||
-		    (!call->class->probe && !call->class->reg))
+		if (!call->name || !call->class || !call->class->reg)
 			continue;
 
 		if (system && strcmp(call->class->system, system) != 0)
@@ -977,12 +994,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 		return -1;
 	}
 
-	if (call->class->probe || call->class->reg)
+	if (call->class->reg)
 		trace_create_file("enable", 0644, call->dir, call,
 				  enable);
 
 #ifdef CONFIG_PERF_EVENTS
-	if (call->event.type && (call->class->perf_probe || call->class->reg))
+	if (call->event.type && call->class->reg)
 		trace_create_file("id", 0444, call->dir, call,
 		 		  id);
 #endif
-- 
cgit v1.2.3


From 64166699752006f1a23a9cf7c96ae36654ccfc2c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:11 +0200
Subject: workqueue: temporarily remove workqueue tracing

Strip tracing code from workqueue and remove workqueue tracing.  This
is temporary measure till concurrency managed workqueue is complete.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/Kconfig | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..a0d95c1f3f82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -391,17 +391,6 @@ config KMEMTRACE
 
 	  If unsure, say N.
 
-config WORKQUEUE_TRACER
-	bool "Trace workqueues"
-	select GENERIC_TRACER
-	help
-	  The workqueue tracer provides some statistical information
-          about each cpu workqueue thread such as the number of the
-          works inserted and executed since their creation. It can help
-          to evaluate the amount of work each of them has to perform.
-          For example it can help a developer to decide whether he should
-          choose a per-cpu workqueue instead of a singlethreaded one.
-
 config BLK_DEV_IO_TRACE
 	bool "Support for tracing block IO actions"
 	depends on SYSFS
-- 
cgit v1.2.3


From e09c8614b32915c16f68e039ac7040e602d73e35 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 5 Jul 2010 15:54:45 -0300
Subject: tracing/kprobes: Support "string" type

Support string type tracing and printing in kprobe-tracer.

This allows user to trace string data in kernel including __user data. Note
that sometimes __user data may not be accessed if it is paged-out (sorry, but
kprobes operation should be done in atomic, we can not wait for page-in).

Commiter note: Fixed up conflicts with b7e2ece.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100519195724.2885.18788.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/trace/trace_kprobe.c | 370 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 292 insertions(+), 78 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3b831d8e201e..1b79d1c15726 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,6 +30,8 @@
 #include <linux/ptrace.h>
 #include <linux/perf_event.h>
 #include <linux/stringify.h>
+#include <linux/limits.h>
+#include <linux/uaccess.h>
 #include <asm/bitsperlong.h>
 
 #include "trace.h"
@@ -38,6 +40,7 @@
 #define MAX_TRACE_ARGS 128
 #define MAX_ARGSTR_LEN 63
 #define MAX_EVENT_NAME_LEN 64
+#define MAX_STRING_SIZE PATH_MAX
 #define KPROBE_EVENT_SYSTEM "kprobes"
 
 /* Reserved field names */
@@ -58,14 +61,16 @@ const char *reserved_field_names[] = {
 };
 
 /* Printing function type */
-typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
+				 void *);
 #define PRINT_TYPE_FUNC_NAME(type)	print_type_##type
 #define PRINT_TYPE_FMT_NAME(type)	print_type_format_##type
 
 /* Printing  in basic type function template */
 #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast)			\
 static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s,	\
-						const char *name, void *data)\
+						const char *name,	\
+						void *data, void *ent)\
 {									\
 	return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
 }									\
@@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
 DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
 DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
 
+/* data_rloc: data relative location, compatible with u32 */
+#define make_data_rloc(len, roffs)	\
+	(((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
+#define get_rloc_len(dl)	((u32)(dl) >> 16)
+#define get_rloc_offs(dl)	((u32)(dl) & 0xffff)
+
+static inline void *get_rloc_data(u32 *dl)
+{
+	return (u8 *)dl + get_rloc_offs(*dl);
+}
+
+/* For data_loc conversion */
+static inline void *get_loc_data(u32 *dl, void *ent)
+{
+	return (u8 *)ent + get_rloc_offs(*dl);
+}
+
+/*
+ * Convert data_rloc to data_loc:
+ *  data_rloc stores the offset from data_rloc itself, but data_loc
+ *  stores the offset from event entry.
+ */
+#define convert_rloc_to_loc(dl, offs)	((u32)(dl) + (offs))
+
+/* For defining macros, define string/string_size types */
+typedef u32 string;
+typedef u32 string_size;
+
+/* Print type function for string type */
+static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
+						  const char *name,
+						  void *data, void *ent)
+{
+	int len = *(u32 *)data >> 16;
+
+	if (!len)
+		return trace_seq_printf(s, " %s=(fault)", name);
+	else
+		return trace_seq_printf(s, " %s=\"%s\"", name,
+					(const char *)get_loc_data(data, ent));
+}
+static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
+
 /* Data fetch function type */
 typedef	void (*fetch_func_t)(struct pt_regs *, void *, void *);
 
@@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm,
 	return fprm->fn(regs, fprm->data, dest);
 }
 
-#define FETCH_FUNC_NAME(kind, type)	fetch_##kind##_##type
+#define FETCH_FUNC_NAME(method, type)	fetch_##method##_##type
 /*
  * Define macro for basic types - we don't need to define s* types, because
  * we have to care only about bitwidth at recording time.
  */
-#define DEFINE_BASIC_FETCH_FUNCS(kind)  \
-DEFINE_FETCH_##kind(u8)			\
-DEFINE_FETCH_##kind(u16)		\
-DEFINE_FETCH_##kind(u32)		\
-DEFINE_FETCH_##kind(u64)
-
-#define CHECK_BASIC_FETCH_FUNCS(kind, fn)	\
-	((FETCH_FUNC_NAME(kind, u8) == fn) ||	\
-	 (FETCH_FUNC_NAME(kind, u16) == fn) ||	\
-	 (FETCH_FUNC_NAME(kind, u32) == fn) ||	\
-	 (FETCH_FUNC_NAME(kind, u64) == fn))
+#define DEFINE_BASIC_FETCH_FUNCS(method) \
+DEFINE_FETCH_##method(u8)		\
+DEFINE_FETCH_##method(u16)		\
+DEFINE_FETCH_##method(u32)		\
+DEFINE_FETCH_##method(u64)
+
+#define CHECK_FETCH_FUNCS(method, fn)			\
+	(((FETCH_FUNC_NAME(method, u8) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, u16) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, u32) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, u64) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, string) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, string_size) == fn)) \
+	 && (fn != NULL))
 
 /* Data fetch function templates */
 #define DEFINE_FETCH_reg(type)						\
 static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs,	\
-					  void *offset, void *dest)	\
+					void *offset, void *dest)	\
 {									\
 	*(type *)dest = (type)regs_get_register(regs,			\
 				(unsigned int)((unsigned long)offset));	\
 }
 DEFINE_BASIC_FETCH_FUNCS(reg)
+/* No string on the register */
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
 
 #define DEFINE_FETCH_stack(type)					\
 static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
@@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
 				(unsigned int)((unsigned long)offset));	\
 }
 DEFINE_BASIC_FETCH_FUNCS(stack)
+/* No string on the stack entry */
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
 
 #define DEFINE_FETCH_retval(type)					\
 static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
@@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
 	*(type *)dest = (type)regs_return_value(regs);			\
 }
 DEFINE_BASIC_FETCH_FUNCS(retval)
+/* No string on the retval */
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
 
 #define DEFINE_FETCH_memory(type)					\
 static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
@@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
 		*(type *)dest = retval;					\
 }
 DEFINE_BASIC_FETCH_FUNCS(memory)
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
+ * length and relative data location.
+ */
+static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+						      void *addr, void *dest)
+{
+	long ret;
+	int maxlen = get_rloc_len(*(u32 *)dest);
+	u8 *dst = get_rloc_data(dest);
+	u8 *src = addr;
+	mm_segment_t old_fs = get_fs();
+	if (!maxlen)
+		return;
+	/*
+	 * Try to get string again, since the string can be changed while
+	 * probing.
+	 */
+	set_fs(KERNEL_DS);
+	pagefault_disable();
+	do
+		ret = __copy_from_user_inatomic(dst++, src++, 1);
+	while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
+	dst[-1] = '\0';
+	pagefault_enable();
+	set_fs(old_fs);
+
+	if (ret < 0) {	/* Failed to fetch string */
+		((u8 *)get_rloc_data(dest))[0] = '\0';
+		*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
+	} else
+		*(u32 *)dest = make_data_rloc(src - (u8 *)addr,
+					      get_rloc_offs(*(u32 *)dest));
+}
+/* Return the length of string -- including null terminal byte */
+static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+							void *addr, void *dest)
+{
+	int ret, len = 0;
+	u8 c;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	pagefault_disable();
+	do {
+		ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
+		len++;
+	} while (c && ret == 0 && len < MAX_STRING_SIZE);
+	pagefault_enable();
+	set_fs(old_fs);
+
+	if (ret < 0)	/* Failed to check the length */
+		*(u32 *)dest = 0;
+	else
+		*(u32 *)dest = len;
+}
 
 /* Memory fetching by symbol */
 struct symbol_cache {
@@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
 		*(type *)dest = 0;					\
 }
 DEFINE_BASIC_FETCH_FUNCS(symbol)
+DEFINE_FETCH_symbol(string)
+DEFINE_FETCH_symbol(string_size)
 
 /* Dereference memory access function */
 struct deref_fetch_param {
@@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
 		*(type *)dest = 0;					\
 }
 DEFINE_BASIC_FETCH_FUNCS(deref)
+DEFINE_FETCH_deref(string)
+DEFINE_FETCH_deref(string_size)
 
 static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
 {
-	if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
+	if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
 		free_deref_fetch_param(data->orig.data);
-	else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
+	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
 		free_symbol_cache(data->orig.data);
 	kfree(data);
 }
@@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
 #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
 #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
 
-#define ASSIGN_FETCH_FUNC(kind, type)	\
-	.kind = FETCH_FUNC_NAME(kind, type)
-
-#define ASSIGN_FETCH_TYPE(ptype, ftype, sign)	\
-	{.name = #ptype,			\
-	 .size = sizeof(ftype),			\
-	 .is_signed = sign,			\
-	 .print = PRINT_TYPE_FUNC_NAME(ptype),	\
-	 .fmt = PRINT_TYPE_FMT_NAME(ptype),	\
-ASSIGN_FETCH_FUNC(reg, ftype),			\
-ASSIGN_FETCH_FUNC(stack, ftype),		\
-ASSIGN_FETCH_FUNC(retval, ftype),		\
-ASSIGN_FETCH_FUNC(memory, ftype),		\
-ASSIGN_FETCH_FUNC(symbol, ftype),		\
-ASSIGN_FETCH_FUNC(deref, ftype),		\
+/* Fetch types */
+enum {
+	FETCH_MTD_reg = 0,
+	FETCH_MTD_stack,
+	FETCH_MTD_retval,
+	FETCH_MTD_memory,
+	FETCH_MTD_symbol,
+	FETCH_MTD_deref,
+	FETCH_MTD_END,
+};
+
+#define ASSIGN_FETCH_FUNC(method, type)	\
+	[FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
+
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype)	\
+	{.name = _name,				\
+	 .size = _size,					\
+	 .is_signed = sign,				\
+	 .print = PRINT_TYPE_FUNC_NAME(ptype),		\
+	 .fmt = PRINT_TYPE_FMT_NAME(ptype),		\
+	 .fmttype = _fmttype,				\
+	 .fetch = {					\
+ASSIGN_FETCH_FUNC(reg, ftype),				\
+ASSIGN_FETCH_FUNC(stack, ftype),			\
+ASSIGN_FETCH_FUNC(retval, ftype),			\
+ASSIGN_FETCH_FUNC(memory, ftype),			\
+ASSIGN_FETCH_FUNC(symbol, ftype),			\
+ASSIGN_FETCH_FUNC(deref, ftype),			\
+	  }						\
 	}
 
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign)			\
+	__ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+
+#define FETCH_TYPE_STRING 0
+#define FETCH_TYPE_STRSIZE 1
+
 /* Fetch type information table */
 static const struct fetch_type {
 	const char	*name;		/* Name of type */
@@ -264,14 +404,16 @@ static const struct fetch_type {
 	int		is_signed;	/* Signed flag */
 	print_type_func_t	print;	/* Print functions */
 	const char	*fmt;		/* Fromat string */
+	const char	*fmttype;	/* Name in format file */
 	/* Fetch functions */
-	fetch_func_t	reg;
-	fetch_func_t	stack;
-	fetch_func_t	retval;
-	fetch_func_t	memory;
-	fetch_func_t	symbol;
-	fetch_func_t	deref;
+	fetch_func_t	fetch[FETCH_MTD_END];
 } fetch_type_table[] = {
+	/* Special types */
+	[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
+					sizeof(u32), 1, "__data_loc char[]"),
+	[FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
+					string_size, sizeof(u32), 0, "u32"),
+	/* Basic types */
 	ASSIGN_FETCH_TYPE(u8,  u8,  0),
 	ASSIGN_FETCH_TYPE(u16, u16, 0),
 	ASSIGN_FETCH_TYPE(u32, u32, 0),
@@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs,
 	*(unsigned long *)dest = kernel_stack_pointer(regs);
 }
 
+static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
+					    fetch_func_t orig_fn)
+{
+	int i;
+
+	if (type != &fetch_type_table[FETCH_TYPE_STRING])
+		return NULL;	/* Only string type needs size function */
+	for (i = 0; i < FETCH_MTD_END; i++)
+		if (type->fetch[i] == orig_fn)
+			return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
+
+	WARN_ON(1);	/* This should not happen */
+	return NULL;
+}
+
 /**
  * Kprobe event core functions
  */
 
 struct probe_arg {
 	struct fetch_param	fetch;
+	struct fetch_param	fetch_size;
 	unsigned int		offset;	/* Offset from argument entry */
 	const char		*name;	/* Name of this argument */
 	const char		*comm;	/* Command of this argument */
@@ -429,9 +587,9 @@ error:
 
 static void free_probe_arg(struct probe_arg *arg)
 {
-	if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
+	if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
 		free_deref_fetch_param(arg->fetch.data);
-	else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
+	else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
 		free_symbol_cache(arg->fetch.data);
 	kfree(arg->name);
 	kfree(arg->comm);
@@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
 
 	if (strcmp(arg, "retval") == 0) {
 		if (is_return)
-			f->fn = t->retval;
+			f->fn = t->fetch[FETCH_MTD_retval];
 		else
 			ret = -EINVAL;
 	} else if (strncmp(arg, "stack", 5) == 0) {
@@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
 			if (ret || param > PARAM_MAX_STACK)
 				ret = -EINVAL;
 			else {
-				f->fn = t->stack;
+				f->fn = t->fetch[FETCH_MTD_stack];
 				f->data = (void *)param;
 			}
 		} else
@@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
 	case '%':	/* named register */
 		ret = regs_query_register_offset(arg + 1);
 		if (ret >= 0) {
-			f->fn = t->reg;
+			f->fn = t->fetch[FETCH_MTD_reg];
 			f->data = (void *)(unsigned long)ret;
 			ret = 0;
 		}
@@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
 			ret = strict_strtoul(arg + 1, 0, &param);
 			if (ret)
 				break;
-			f->fn = t->memory;
+			f->fn = t->fetch[FETCH_MTD_memory];
 			f->data = (void *)param;
 		} else {
 			ret = split_symbol_offset(arg + 1, &offset);
@@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
 				break;
 			f->data = alloc_symbol_cache(arg + 1, offset);
 			if (f->data)
-				f->fn = t->symbol;
+				f->fn = t->fetch[FETCH_MTD_symbol];
 		}
 		break;
 	case '+':	/* deref memory */
@@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
 			if (ret)
 				kfree(dprm);
 			else {
-				f->fn = t->deref;
+				f->fn = t->fetch[FETCH_MTD_deref];
 				f->data = (void *)dprm;
 			}
 		}
 		break;
 	}
-	if (!ret && !f->fn)
+	if (!ret && !f->fn) {	/* Parsed, but do not find fetch method */
+		pr_info("%s type has no corresponding fetch method.\n",
+			t->name);
 		ret = -EINVAL;
+	}
 	return ret;
 }
 
@@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
 			   struct probe_arg *parg, int is_return)
 {
 	const char *t;
+	int ret;
 
 	if (strlen(arg) > MAX_ARGSTR_LEN) {
 		pr_info("Argument is too long.: %s\n",  arg);
@@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
 	}
 	parg->offset = tp->size;
 	tp->size += parg->type->size;
-	return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+	ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+	if (ret >= 0) {
+		parg->fetch_size.fn = get_fetch_size_function(parg->type,
+							      parg->fetch.fn);
+		parg->fetch_size.data = parg->fetch.data;
+	}
+	return ret;
 }
 
 /* Return 1 if name is reserved or already used by another argument */
@@ -1043,6 +1211,54 @@ static const struct file_operations kprobe_profile_ops = {
 	.release        = seq_release,
 };
 
+/* Sum up total data length for dynamic arraies (strings) */
+static __kprobes int __get_data_size(struct trace_probe *tp,
+				     struct pt_regs *regs)
+{
+	int i, ret = 0;
+	u32 len;
+
+	for (i = 0; i < tp->nr_args; i++)
+		if (unlikely(tp->args[i].fetch_size.fn)) {
+			call_fetch(&tp->args[i].fetch_size, regs, &len);
+			ret += len;
+		}
+
+	return ret;
+}
+
+/* Store the value of each argument */
+static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
+				       struct pt_regs *regs,
+				       u8 *data, int maxlen)
+{
+	int i;
+	u32 end = tp->size;
+	u32 *dl;	/* Data (relative) location */
+
+	for (i = 0; i < tp->nr_args; i++) {
+		if (unlikely(tp->args[i].fetch_size.fn)) {
+			/*
+			 * First, we set the relative location and
+			 * maximum data length to *dl
+			 */
+			dl = (u32 *)(data + tp->args[i].offset);
+			*dl = make_data_rloc(maxlen, end - tp->args[i].offset);
+			/* Then try to fetch string or dynamic array data */
+			call_fetch(&tp->args[i].fetch, regs, dl);
+			/* Reduce maximum length */
+			end += get_rloc_len(*dl);
+			maxlen -= get_rloc_len(*dl);
+			/* Trick here, convert data_rloc to data_loc */
+			*dl = convert_rloc_to_loc(*dl,
+				 ent_size + tp->args[i].offset);
+		} else
+			/* Just fetching data normally */
+			call_fetch(&tp->args[i].fetch, regs,
+				   data + tp->args[i].offset);
+	}
+}
+
 /* Kprobe handler */
 static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 {
@@ -1050,8 +1266,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	struct kprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
-	u8 *data;
-	int size, i, pc;
+	int size, dsize, pc;
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
 
@@ -1060,7 +1275,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-	size = sizeof(*entry) + tp->size;
+	dsize = __get_data_size(tp, regs);
+	size = sizeof(*entry) + tp->size + dsize;
 
 	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
 						  size, irq_flags, pc);
@@ -1069,9 +1285,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 
 	entry = ring_buffer_event_data(event);
 	entry->ip = (unsigned long)kp->addr;
-	data = (u8 *)&entry[1];
-	for (i = 0; i < tp->nr_args; i++)
-		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
+	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1085,15 +1299,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 	struct kretprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
-	u8 *data;
-	int size, i, pc;
+	int size, pc, dsize;
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
 
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-	size = sizeof(*entry) + tp->size;
+	dsize = __get_data_size(tp, regs);
+	size = sizeof(*entry) + tp->size + dsize;
 
 	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
 						  size, irq_flags, pc);
@@ -1103,9 +1317,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 	entry = ring_buffer_event_data(event);
 	entry->func = (unsigned long)tp->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
-	data = (u8 *)&entry[1];
-	for (i = 0; i < tp->nr_args; i++)
-		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
+	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1137,7 +1349,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
 	data = (u8 *)&field[1];
 	for (i = 0; i < tp->nr_args; i++)
 		if (!tp->args[i].type->print(s, tp->args[i].name,
-					     data + tp->args[i].offset))
+					     data + tp->args[i].offset, field))
 			goto partial;
 
 	if (!trace_seq_puts(s, "\n"))
@@ -1179,7 +1391,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
 	data = (u8 *)&field[1];
 	for (i = 0; i < tp->nr_args; i++)
 		if (!tp->args[i].type->print(s, tp->args[i].name,
-					     data + tp->args[i].offset))
+					     data + tp->args[i].offset, field))
 			goto partial;
 
 	if (!trace_seq_puts(s, "\n"))
@@ -1234,7 +1446,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
 	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
 	/* Set argument names as fields */
 	for (i = 0; i < tp->nr_args; i++) {
-		ret = trace_define_field(event_call, tp->args[i].type->name,
+		ret = trace_define_field(event_call, tp->args[i].type->fmttype,
 					 tp->args[i].name,
 					 sizeof(field) + tp->args[i].offset,
 					 tp->args[i].type->size,
@@ -1256,7 +1468,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
 	DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
 	/* Set argument names as fields */
 	for (i = 0; i < tp->nr_args; i++) {
-		ret = trace_define_field(event_call, tp->args[i].type->name,
+		ret = trace_define_field(event_call, tp->args[i].type->fmttype,
 					 tp->args[i].name,
 					 sizeof(field) + tp->args[i].offset,
 					 tp->args[i].type->size,
@@ -1296,8 +1508,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
 
 	for (i = 0; i < tp->nr_args; i++) {
-		pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
-				tp->args[i].name);
+		if (strcmp(tp->args[i].type->name, "string") == 0)
+			pos += snprintf(buf + pos, LEN_OR_ZERO,
+					", __get_str(%s)",
+					tp->args[i].name);
+		else
+			pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+					tp->args[i].name);
 	}
 
 #undef LEN_OR_ZERO
@@ -1334,11 +1551,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
-	u8 *data;
-	int size, __size, i;
+	int size, __size, dsize;
 	int rctx;
 
-	__size = sizeof(*entry) + tp->size;
+	dsize = __get_data_size(tp, regs);
+	__size = sizeof(*entry) + tp->size + dsize;
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1350,9 +1567,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 		return;
 
 	entry->ip = (unsigned long)kp->addr;
-	data = (u8 *)&entry[1];
-	for (i = 0; i < tp->nr_args; i++)
-		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
+	memset(&entry[1], 0, dsize);
+	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	head = this_cpu_ptr(call->perf_events);
 	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
@@ -1366,11 +1582,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
-	u8 *data;
-	int size, __size, i;
+	int size, __size, dsize;
 	int rctx;
 
-	__size = sizeof(*entry) + tp->size;
+	dsize = __get_data_size(tp, regs);
+	__size = sizeof(*entry) + tp->size + dsize;
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1383,9 +1599,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 
 	entry->func = (unsigned long)tp->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
-	data = (u8 *)&entry[1];
-	for (i = 0; i < tp->nr_args; i++)
-		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
+	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	head = this_cpu_ptr(call->perf_events);
 	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
-- 
cgit v1.2.3


From 5e3d20a68f63fc5a310687d81956c3b96e488b84 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 4 Jul 2010 00:02:26 +0200
Subject: init: Remove the BKL from startup code

I have shown by code review that no driver takes
the BKL at init time any more, so whatever the
init code was locking against is no longer there
and it is now safe to remove the BKL there.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Steven Rostedt <rostedt@goodmis>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..8047ca5a8237 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -734,13 +734,6 @@ __acquires(kernel_lock)
 		return -1;
 	}
 
-	/*
-	 * When this gets called we hold the BKL which means that
-	 * preemption is disabled. Various trace selftests however
-	 * need to disable and enable preemption for successful tests.
-	 * So we drop the BKL here and grab it after the tests again.
-	 */
-	unlock_kernel();
 	mutex_lock(&trace_types_lock);
 
 	tracing_selftest_running = true;
@@ -822,7 +815,6 @@ __acquires(kernel_lock)
 #endif
 
  out_unlock:
-	lock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3


From 5d550467b9770042e9699690907babc32104a8d4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 15 Jul 2010 23:27:35 +0200
Subject: tracing: Remove ksym tracer

The ksym (breakpoint) ftrace plugin has been superseded by perf
tools that are much more poweful to use the cpu breakpoints.
This tracer doesn't bring more feature. It has been deprecated
for a while now, lets remove it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig          |  22 --
 kernel/trace/Makefile         |   1 -
 kernel/trace/trace.h          |   6 -
 kernel/trace/trace_entries.h  |  15 --
 kernel/trace/trace_ksym.c     | 508 ------------------------------------------
 kernel/trace/trace_selftest.c |  54 -----
 6 files changed, 606 deletions(-)
 delete mode 100644 kernel/trace/trace_ksym.c

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f669092fdead..f5306cb0afb1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -308,28 +308,6 @@ config BRANCH_TRACER
 
 	  Say N if unsure.
 
-config KSYM_TRACER
-	bool "Trace read and write access on kernel memory locations"
-	depends on HAVE_HW_BREAKPOINT
-	select TRACING
-	help
-	  This tracer helps find read and write operations on any given kernel
-	  symbol i.e. /proc/kallsyms.
-
-config PROFILE_KSYM_TRACER
-	bool "Profile all kernel memory accesses on 'watched' variables"
-	depends on KSYM_TRACER
-	help
-	  This tracer profiles kernel accesses on variables watched through the
-	  ksym tracer ftrace plugin. Depending upon the hardware, all read
-	  and write operations on kernel variables can be monitored for
-	  accesses.
-
-	  The results will be displayed in:
-	  /debugfs/tracing/profile_ksym
-
-	  Say N if unsure.
-
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 469a1c7555a5..84b2c9908dae 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,7 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc90ccdad469..84d3f123e86f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -30,7 +30,6 @@ enum trace_type {
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
 	TRACE_BLK,
-	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
@@ -200,7 +199,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
-		IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -361,8 +359,6 @@ int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 int is_tracing_stopped(void);
 
-extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
-
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 extern unsigned long tracing_thresh;
@@ -436,8 +432,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
-extern int trace_selftest_startup_ksym(struct tracer *trace,
-					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 13abc157dbaf..84128371f254 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -291,18 +291,3 @@ FTRACE_ENTRY(branch, trace_branch,
 		 __entry->func, __entry->file, __entry->correct)
 );
 
-FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
-
-	TRACE_KSYM,
-
-	F_STRUCT(
-		__field(	unsigned long,	ip			  )
-		__field(	unsigned char,	type			  )
-		__array(	char	     ,	cmd,	   TASK_COMM_LEN  )
-		__field(	unsigned long,  addr			  )
-	),
-
-	F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
-		(void *)__entry->ip, (unsigned int)__entry->type,
-		(void *)__entry->addr,  __entry->cmd)
-);
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index 8eaf00749b65..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,508 +0,0 @@
-/*
- * trace_ksym.c - Kernel Symbol Tracer
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2009
- */
-
-#include <linux/kallsyms.h>
-#include <linux/uaccess.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-#include <linux/hw_breakpoint.h>
-#include <asm/hw_breakpoint.h>
-
-#include <asm/atomic.h>
-
-#define KSYM_TRACER_OP_LEN 3 /* rw- */
-
-struct trace_ksym {
-	struct perf_event	**ksym_hbp;
-	struct perf_event_attr	attr;
-#ifdef CONFIG_PROFILE_KSYM_TRACER
-	atomic64_t		counter;
-#endif
-	struct hlist_node	ksym_hlist;
-};
-
-static struct trace_array *ksym_trace_array;
-
-static unsigned int ksym_tracing_enabled;
-
-static HLIST_HEAD(ksym_filter_head);
-
-static DEFINE_MUTEX(ksym_tracer_mutex);
-
-#ifdef CONFIG_PROFILE_KSYM_TRACER
-
-#define MAX_UL_INT 0xffffffff
-
-void ksym_collect_stats(unsigned long hbp_hit_addr)
-{
-	struct hlist_node *node;
-	struct trace_ksym *entry;
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
-		if (entry->attr.bp_addr == hbp_hit_addr) {
-			atomic64_inc(&entry->counter);
-			break;
-		}
-	}
-	rcu_read_unlock();
-}
-#endif /* CONFIG_PROFILE_KSYM_TRACER */
-
-void ksym_hbp_handler(struct perf_event *hbp, int nmi,
-		      struct perf_sample_data *data,
-		      struct pt_regs *regs)
-{
-	struct ring_buffer_event *event;
-	struct ksym_trace_entry *entry;
-	struct ring_buffer *buffer;
-	int pc;
-
-	if (!ksym_tracing_enabled)
-		return;
-
-	buffer = ksym_trace_array->buffer;
-
-	pc = preempt_count();
-
-	event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
-							sizeof(*entry), 0, pc);
-	if (!event)
-		return;
-
-	entry		= ring_buffer_event_data(event);
-	entry->ip	= instruction_pointer(regs);
-	entry->type	= hw_breakpoint_type(hbp);
-	entry->addr	= hw_breakpoint_addr(hbp);
-	strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
-
-#ifdef CONFIG_PROFILE_KSYM_TRACER
-	ksym_collect_stats(hw_breakpoint_addr(hbp));
-#endif /* CONFIG_PROFILE_KSYM_TRACER */
-
-	trace_buffer_unlock_commit(buffer, event, 0, pc);
-}
-
-/* Valid access types are represented as
- *
- * rw- : Set Read/Write Access Breakpoint
- * -w- : Set Write Access Breakpoint
- * --- : Clear Breakpoints
- * --x : Set Execution Break points (Not available yet)
- *
- */
-static int ksym_trace_get_access_type(char *str)
-{
-	int access = 0;
-
-	if (str[0] == 'r')
-		access |= HW_BREAKPOINT_R;
-
-	if (str[1] == 'w')
-		access |= HW_BREAKPOINT_W;
-
-	if (str[2] == 'x')
-		access |= HW_BREAKPOINT_X;
-
-	switch (access) {
-	case HW_BREAKPOINT_R:
-	case HW_BREAKPOINT_W:
-	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
-		return access;
-	default:
-		return -EINVAL;
-	}
-}
-
-/*
- * There can be several possible malformed requests and we attempt to capture
- * all of them. We enumerate some of the rules
- * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
- *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
- *    <module>:<ksym_name>:<op>.
- * 2. No delimiter symbol ':' in the input string
- * 3. Spurious operator symbols or symbols not in their respective positions
- * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
- * 5. Kernel symbol not a part of /proc/kallsyms
- * 6. Duplicate requests
- */
-static int parse_ksym_trace_str(char *input_string, char **ksymname,
-							unsigned long *addr)
-{
-	int ret;
-
-	*ksymname = strsep(&input_string, ":");
-	*addr = kallsyms_lookup_name(*ksymname);
-
-	/* Check for malformed request: (2), (1) and (5) */
-	if ((!input_string) ||
-	    (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
-	    (*addr == 0))
-		return -EINVAL;;
-
-	ret = ksym_trace_get_access_type(input_string);
-
-	return ret;
-}
-
-int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
-{
-	struct trace_ksym *entry;
-	int ret = -ENOMEM;
-
-	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
-	if (!entry)
-		return -ENOMEM;
-
-	hw_breakpoint_init(&entry->attr);
-
-	entry->attr.bp_type = op;
-	entry->attr.bp_addr = addr;
-	entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
-
-	entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
-					ksym_hbp_handler);
-
-	if (IS_ERR(entry->ksym_hbp)) {
-		ret = PTR_ERR(entry->ksym_hbp);
-		if (ret == -ENOSPC) {
-			printk(KERN_ERR "ksym_tracer: Maximum limit reached."
-			" No new requests for tracing can be accepted now.\n");
-		} else {
-			printk(KERN_INFO "ksym_tracer request failed. Try again"
-					 " later!!\n");
-		}
-		goto err;
-	}
-
-	hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
-
-	return 0;
-
-err:
-	kfree(entry);
-
-	return ret;
-}
-
-static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
-						size_t count, loff_t *ppos)
-{
-	struct trace_ksym *entry;
-	struct hlist_node *node;
-	struct trace_seq *s;
-	ssize_t cnt = 0;
-	int ret;
-
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
-	trace_seq_init(s);
-
-	mutex_lock(&ksym_tracer_mutex);
-
-	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		ret = trace_seq_printf(s, "%pS:",
-				(void *)(unsigned long)entry->attr.bp_addr);
-		if (entry->attr.bp_type == HW_BREAKPOINT_R)
-			ret = trace_seq_puts(s, "r--\n");
-		else if (entry->attr.bp_type == HW_BREAKPOINT_W)
-			ret = trace_seq_puts(s, "-w-\n");
-		else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
-			ret = trace_seq_puts(s, "rw-\n");
-		WARN_ON_ONCE(!ret);
-	}
-
-	cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
-
-	mutex_unlock(&ksym_tracer_mutex);
-
-	kfree(s);
-
-	return cnt;
-}
-
-static void __ksym_trace_reset(void)
-{
-	struct trace_ksym *entry;
-	struct hlist_node *node, *node1;
-
-	mutex_lock(&ksym_tracer_mutex);
-	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
-								ksym_hlist) {
-		unregister_wide_hw_breakpoint(entry->ksym_hbp);
-		hlist_del_rcu(&(entry->ksym_hlist));
-		synchronize_rcu();
-		kfree(entry);
-	}
-	mutex_unlock(&ksym_tracer_mutex);
-}
-
-static ssize_t ksym_trace_filter_write(struct file *file,
-					const char __user *buffer,
-						size_t count, loff_t *ppos)
-{
-	struct trace_ksym *entry;
-	struct hlist_node *node;
-	char *buf, *input_string, *ksymname = NULL;
-	unsigned long ksym_addr = 0;
-	int ret, op, changed = 0;
-
-	buf = kzalloc(count + 1, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	ret = -EFAULT;
-	if (copy_from_user(buf, buffer, count))
-		goto out;
-
-	buf[count] = '\0';
-	input_string = strstrip(buf);
-
-	/*
-	 * Clear all breakpoints if:
-	 * 1: echo > ksym_trace_filter
-	 * 2: echo 0 > ksym_trace_filter
-	 * 3: echo "*:---" > ksym_trace_filter
-	 */
-	if (!input_string[0] || !strcmp(input_string, "0") ||
-	    !strcmp(input_string, "*:---")) {
-		__ksym_trace_reset();
-		ret = 0;
-		goto out;
-	}
-
-	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
-	if (ret < 0)
-		goto out;
-
-	mutex_lock(&ksym_tracer_mutex);
-
-	ret = -EINVAL;
-	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		if (entry->attr.bp_addr == ksym_addr) {
-			/* Check for malformed request: (6) */
-			if (entry->attr.bp_type != op)
-				changed = 1;
-			else
-				goto out_unlock;
-			break;
-		}
-	}
-	if (changed) {
-		unregister_wide_hw_breakpoint(entry->ksym_hbp);
-		entry->attr.bp_type = op;
-		ret = 0;
-		if (op > 0) {
-			entry->ksym_hbp =
-				register_wide_hw_breakpoint(&entry->attr,
-					ksym_hbp_handler);
-			if (IS_ERR(entry->ksym_hbp))
-				ret = PTR_ERR(entry->ksym_hbp);
-			else
-				goto out_unlock;
-		}
-		/* Error or "symbol:---" case: drop it */
-		hlist_del_rcu(&(entry->ksym_hlist));
-		synchronize_rcu();
-		kfree(entry);
-		goto out_unlock;
-	} else {
-		/* Check for malformed request: (4) */
-		if (op)
-			ret = process_new_ksym_entry(ksymname, op, ksym_addr);
-	}
-out_unlock:
-	mutex_unlock(&ksym_tracer_mutex);
-out:
-	kfree(buf);
-	return !ret ? count : ret;
-}
-
-static const struct file_operations ksym_tracing_fops = {
-	.open		= tracing_open_generic,
-	.read		= ksym_trace_filter_read,
-	.write		= ksym_trace_filter_write,
-};
-
-static void ksym_trace_reset(struct trace_array *tr)
-{
-	ksym_tracing_enabled = 0;
-	__ksym_trace_reset();
-}
-
-static int ksym_trace_init(struct trace_array *tr)
-{
-	int cpu, ret = 0;
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-	ksym_tracing_enabled = 1;
-	ksym_trace_array = tr;
-
-	return ret;
-}
-
-static void ksym_trace_print_header(struct seq_file *m)
-{
-	seq_puts(m,
-		 "#       TASK-PID   CPU#      Symbol                    "
-		 "Type    Function\n");
-	seq_puts(m,
-		 "#          |        |          |                       "
-		 " |         |\n");
-}
-
-static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
-{
-	struct trace_entry *entry = iter->ent;
-	struct trace_seq *s = &iter->seq;
-	struct ksym_trace_entry *field;
-	char str[KSYM_SYMBOL_LEN];
-	int ret;
-
-	if (entry->type != TRACE_KSYM)
-		return TRACE_TYPE_UNHANDLED;
-
-	trace_assign_type(field, entry);
-
-	ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
-				entry->pid, iter->cpu, (char *)field->addr);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	switch (field->type) {
-	case HW_BREAKPOINT_R:
-		ret = trace_seq_printf(s, " R  ");
-		break;
-	case HW_BREAKPOINT_W:
-		ret = trace_seq_printf(s, " W  ");
-		break;
-	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
-		ret = trace_seq_printf(s, " RW ");
-		break;
-	default:
-		return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	sprint_symbol(str, field->ip);
-	ret = trace_seq_printf(s, "%s\n", str);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-struct tracer ksym_tracer __read_mostly =
-{
-	.name		= "ksym_tracer",
-	.init		= ksym_trace_init,
-	.reset		= ksym_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_ksym,
-#endif
-	.print_header   = ksym_trace_print_header,
-	.print_line	= ksym_trace_output
-};
-
-#ifdef CONFIG_PROFILE_KSYM_TRACER
-static int ksym_profile_show(struct seq_file *m, void *v)
-{
-	struct hlist_node *node;
-	struct trace_ksym *entry;
-	int access_type = 0;
-	char fn_name[KSYM_NAME_LEN];
-
-	seq_puts(m, "  Access Type ");
-	seq_puts(m, "  Symbol                                       Counter\n");
-	seq_puts(m, "  ----------- ");
-	seq_puts(m, "  ------                                       -------\n");
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
-
-		access_type = entry->attr.bp_type;
-
-		switch (access_type) {
-		case HW_BREAKPOINT_R:
-			seq_puts(m, "  R           ");
-			break;
-		case HW_BREAKPOINT_W:
-			seq_puts(m, "  W           ");
-			break;
-		case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
-			seq_puts(m, "  RW          ");
-			break;
-		default:
-			seq_puts(m, "  NA          ");
-		}
-
-		if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
-			seq_printf(m, "  %-36s", fn_name);
-		else
-			seq_printf(m, "  %-36s", "<NA>");
-		seq_printf(m, " %15llu\n",
-			   (unsigned long long)atomic64_read(&entry->counter));
-	}
-	rcu_read_unlock();
-
-	return 0;
-}
-
-static int ksym_profile_open(struct inode *node, struct file *file)
-{
-	return single_open(file, ksym_profile_show, NULL);
-}
-
-static const struct file_operations ksym_profile_fops = {
-	.open		= ksym_profile_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-#endif /* CONFIG_PROFILE_KSYM_TRACER */
-
-__init static int init_ksym_trace(void)
-{
-	struct dentry *d_tracer;
-
-	d_tracer = tracing_init_dentry();
-
-	trace_create_file("ksym_trace_filter", 0644, d_tracer,
-			  NULL, &ksym_tracing_fops);
-
-#ifdef CONFIG_PROFILE_KSYM_TRACER
-	trace_create_file("ksym_profile", 0444, d_tracer,
-			  NULL, &ksym_profile_fops);
-#endif
-
-	return register_tracer(&ksym_tracer);
-}
-device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 250e7f9bd2f0..39a5ca4cf15b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
-	case TRACE_KSYM:
 		return 1;
 	}
 	return 0;
@@ -755,56 +754,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
-#ifdef CONFIG_KSYM_TRACER
-static int ksym_selftest_dummy;
-
-int
-trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
-{
-	unsigned long count;
-	int ret;
-
-	/* start the tracing */
-	ret = tracer_init(trace, tr);
-	if (ret) {
-		warn_failed_init_tracer(trace, ret);
-		return ret;
-	}
-
-	ksym_selftest_dummy = 0;
-	/* Register the read-write tracing request */
-
-	ret = process_new_ksym_entry("ksym_selftest_dummy",
-				     HW_BREAKPOINT_R | HW_BREAKPOINT_W,
-					(unsigned long)(&ksym_selftest_dummy));
-
-	if (ret < 0) {
-		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
-		goto ret_path;
-	}
-	/* Perform a read and a write operation over the dummy variable to
-	 * trigger the tracer
-	 */
-	if (ksym_selftest_dummy == 0)
-		ksym_selftest_dummy++;
-
-	/* stop the tracing. */
-	tracing_stop();
-	/* check the trace buffer */
-	ret = trace_test_buffer(tr, &count);
-	trace->reset(tr);
-	tracing_start();
-
-	/* read & write operations - one each is performed on the dummy variable
-	 * triggering two entries in the trace buffer
-	 */
-	if (!ret && count != 2) {
-		printk(KERN_CONT "Ksym tracer startup test failed");
-		ret = -1;
-	}
-
-ret_path:
-	return ret;
-}
-#endif /* CONFIG_KSYM_TRACER */
-
-- 
cgit v1.2.3


From f376bf5ffbad863d4bc3b2586b7e34cdf756ad17 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 16 Jul 2010 00:26:26 +0200
Subject: tracing: Remove sysprof ftrace plugin

The sysprof ftrace plugin doesn't seem to be seriously used
somewhere. There is a branch in the sysprof tree that makes
an interface to it, but the real sysprof tool uses either its
own module or perf events.

Drop the sysprof ftrace plugin then, as it's mostly useless.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Soeren Sandmann <sandmann@daimi.au.dk>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/Kconfig          |   9 --
 kernel/trace/Makefile         |   1 -
 kernel/trace/trace.c          |   3 -
 kernel/trace/trace.h          |   3 -
 kernel/trace/trace_selftest.c |  32 ----
 kernel/trace/trace_sysprof.c  | 330 ------------------------------------------
 6 files changed, 378 deletions(-)
 delete mode 100644 kernel/trace/trace_sysprof.c

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f5306cb0afb1..c7683fd8a03a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -194,15 +194,6 @@ config PREEMPT_TRACER
 	  enabled. This option and the irqs-off timing option can be
 	  used together or separately.)
 
-config SYSPROF_TRACER
-	bool "Sysprof Tracer"
-	depends on X86
-	select GENERIC_TRACER
-	select CONTEXT_SWITCH_TRACER
-	help
-	  This tracer provides the trace needed by the 'Sysprof' userspace
-	  tool.
-
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 84b2c9908dae..438e84a56ab3 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
-obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8683dec6946b..78a49e67f7db 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4354,9 +4354,6 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
 #endif
-#ifdef CONFIG_SYSPROF_TRACER
-	init_tracer_sysprof_debugfs(d_tracer);
-#endif
 
 	create_trace_options_dir();
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 84d3f123e86f..2114b4c1150f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -296,7 +296,6 @@ struct dentry *trace_create_file(const char *name,
 				 const struct file_operations *fops);
 
 struct dentry *tracing_init_dentry(void);
-void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
 struct ring_buffer_event;
 
@@ -428,8 +427,6 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
 					 struct trace_array *tr);
 extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 					       struct trace_array *tr);
-extern int trace_selftest_startup_sysprof(struct tracer *trace,
-					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 39a5ca4cf15b..6ed05ee6cbc7 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -690,38 +690,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 }
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
-#ifdef CONFIG_SYSPROF_TRACER
-int
-trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
-{
-	unsigned long count;
-	int ret;
-
-	/* start the tracing */
-	ret = tracer_init(trace, tr);
-	if (ret) {
-		warn_failed_init_tracer(trace, ret);
-		return ret;
-	}
-
-	/* Sleep for a 1/10 of a second */
-	msleep(100);
-	/* stop the tracing. */
-	tracing_stop();
-	/* check the trace buffer */
-	ret = trace_test_buffer(tr, &count);
-	trace->reset(tr);
-	tracing_start();
-
-	if (!ret && !count) {
-		printk(KERN_CONT ".. no entries found ..");
-		ret = -1;
-	}
-
-	return ret;
-}
-#endif /* CONFIG_SYSPROF_TRACER */
-
 #ifdef CONFIG_BRANCH_TRACER
 int
 trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index c080956f4d8e..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * trace stack traces
- *
- * Copyright (C) 2004-2008, Soeren Sandmann
- * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
- * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
- */
-#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
-#include <linux/hrtimer.h>
-#include <linux/uaccess.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/irq.h>
-#include <linux/fs.h>
-
-#include <asm/stacktrace.h>
-
-#include "trace.h"
-
-static struct trace_array	*sysprof_trace;
-static int __read_mostly	tracer_enabled;
-
-/*
- * 1 msec sample interval by default:
- */
-static unsigned long sample_period = 1000000;
-static const unsigned int sample_max_depth = 512;
-
-static DEFINE_MUTEX(sample_timer_lock);
-/*
- * Per CPU hrtimers that do the profiling:
- */
-static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
-
-struct stack_frame_user {
-	const void __user	*next_fp;
-	unsigned long		return_address;
-};
-
-static int
-copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
-{
-	int ret;
-
-	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
-		return 0;
-
-	ret = 1;
-	pagefault_disable();
-	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
-		ret = 0;
-	pagefault_enable();
-
-	return ret;
-}
-
-struct backtrace_info {
-	struct trace_array_cpu	*data;
-	struct trace_array	*tr;
-	int			pos;
-};
-
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	/* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
-	/* Ignore warnings */
-}
-
-static int backtrace_stack(void *data, char *name)
-{
-	/* Don't bother with IRQ stacks for now */
-	return -1;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
-	struct backtrace_info *info = data;
-
-	if (info->pos < sample_max_depth && reliable) {
-		__trace_special(info->tr, info->data, 1, addr, 0);
-
-		info->pos++;
-	}
-}
-
-static const struct stacktrace_ops backtrace_ops = {
-	.warning		= backtrace_warning,
-	.warning_symbol		= backtrace_warning_symbol,
-	.stack			= backtrace_stack,
-	.address		= backtrace_address,
-	.walk_stack		= print_context_stack,
-};
-
-static int
-trace_kernel(struct pt_regs *regs, struct trace_array *tr,
-	     struct trace_array_cpu *data)
-{
-	struct backtrace_info info;
-	unsigned long bp;
-	char *stack;
-
-	info.tr = tr;
-	info.data = data;
-	info.pos = 1;
-
-	__trace_special(info.tr, info.data, 1, regs->ip, 0);
-
-	stack = ((char *)regs + sizeof(struct pt_regs));
-#ifdef CONFIG_FRAME_POINTER
-	bp = regs->bp;
-#else
-	bp = 0;
-#endif
-
-	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
-
-	return info.pos;
-}
-
-static void timer_notify(struct pt_regs *regs, int cpu)
-{
-	struct trace_array_cpu *data;
-	struct stack_frame_user frame;
-	struct trace_array *tr;
-	const void __user *fp;
-	int is_user;
-	int i;
-
-	if (!regs)
-		return;
-
-	tr = sysprof_trace;
-	data = tr->data[cpu];
-	is_user = user_mode(regs);
-
-	if (!current || current->pid == 0)
-		return;
-
-	if (is_user && current->state != TASK_RUNNING)
-		return;
-
-	__trace_special(tr, data, 0, 0, current->pid);
-
-	if (!is_user)
-		i = trace_kernel(regs, tr, data);
-	else
-		i = 0;
-
-	/*
-	 * Trace user stack if we are not a kernel thread
-	 */
-	if (current->mm && i < sample_max_depth) {
-		regs = (struct pt_regs *)current->thread.sp0 - 1;
-
-		fp = (void __user *)regs->bp;
-
-		__trace_special(tr, data, 2, regs->ip, 0);
-
-		while (i < sample_max_depth) {
-			frame.next_fp = NULL;
-			frame.return_address = 0;
-			if (!copy_stack_frame(fp, &frame))
-				break;
-			if ((unsigned long)fp < regs->sp)
-				break;
-
-			__trace_special(tr, data, 2, frame.return_address,
-					(unsigned long)fp);
-			fp = frame.next_fp;
-
-			i++;
-		}
-
-	}
-
-	/*
-	 * Special trace entry if we overflow the max depth:
-	 */
-	if (i == sample_max_depth)
-		__trace_special(tr, data, -1, -1, -1);
-
-	__trace_special(tr, data, 3, current->pid, i);
-}
-
-static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
-{
-	/* trace here */
-	timer_notify(get_irq_regs(), smp_processor_id());
-
-	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
-
-	return HRTIMER_RESTART;
-}
-
-static void start_stack_timer(void *unused)
-{
-	struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
-
-	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hrtimer->function = stack_trace_timer_fn;
-
-	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
-		      HRTIMER_MODE_REL_PINNED);
-}
-
-static void start_stack_timers(void)
-{
-	on_each_cpu(start_stack_timer, NULL, 1);
-}
-
-static void stop_stack_timer(int cpu)
-{
-	struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
-
-	hrtimer_cancel(hrtimer);
-}
-
-static void stop_stack_timers(void)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		stop_stack_timer(cpu);
-}
-
-static void stop_stack_trace(struct trace_array *tr)
-{
-	mutex_lock(&sample_timer_lock);
-	stop_stack_timers();
-	tracer_enabled = 0;
-	mutex_unlock(&sample_timer_lock);
-}
-
-static int stack_trace_init(struct trace_array *tr)
-{
-	sysprof_trace = tr;
-
-	tracing_start_cmdline_record();
-
-	mutex_lock(&sample_timer_lock);
-	start_stack_timers();
-	tracer_enabled = 1;
-	mutex_unlock(&sample_timer_lock);
-	return 0;
-}
-
-static void stack_trace_reset(struct trace_array *tr)
-{
-	tracing_stop_cmdline_record();
-	stop_stack_trace(tr);
-}
-
-static struct tracer stack_trace __read_mostly =
-{
-	.name		= "sysprof",
-	.init		= stack_trace_init,
-	.reset		= stack_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest    = trace_selftest_startup_sysprof,
-#endif
-};
-
-__init static int init_stack_trace(void)
-{
-	return register_tracer(&stack_trace);
-}
-device_initcall(init_stack_trace);
-
-#define MAX_LONG_DIGITS 22
-
-static ssize_t
-sysprof_sample_read(struct file *filp, char __user *ubuf,
-		    size_t cnt, loff_t *ppos)
-{
-	char buf[MAX_LONG_DIGITS];
-	int r;
-
-	r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
-
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static ssize_t
-sysprof_sample_write(struct file *filp, const char __user *ubuf,
-		     size_t cnt, loff_t *ppos)
-{
-	char buf[MAX_LONG_DIGITS];
-	unsigned long val;
-
-	if (cnt > MAX_LONG_DIGITS-1)
-		cnt = MAX_LONG_DIGITS-1;
-
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-
-	buf[cnt] = 0;
-
-	val = simple_strtoul(buf, NULL, 10);
-	/*
-	 * Enforce a minimum sample period of 100 usecs:
-	 */
-	if (val < 100)
-		val = 100;
-
-	mutex_lock(&sample_timer_lock);
-	stop_stack_timers();
-	sample_period = val * 1000;
-	start_stack_timers();
-	mutex_unlock(&sample_timer_lock);
-
-	return cnt;
-}
-
-static const struct file_operations sysprof_sample_fops = {
-	.read		= sysprof_sample_read,
-	.write		= sysprof_sample_write,
-};
-
-void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
-{
-
-	trace_create_file("sysprof_sample_period", 0644,
-			d_tracer, NULL, &sysprof_sample_fops);
-}
-- 
cgit v1.2.3


From eb7beb5c09af75494234ea6acd09d0a647cf7338 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 16 Jul 2010 00:50:03 +0200
Subject: tracing: Remove special traces

Special traces type was only used by sysprof. Lets remove it now
that sysprof ftrace plugin has been dropped.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Soeren Sandmann <sandmann@daimi.au.dk>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace.c          | 55 ------------------------------------
 kernel/trace/trace.h          |  7 -----
 kernel/trace/trace_entries.h  | 17 -----------
 kernel/trace/trace_output.c   | 66 -------------------------------------------
 kernel/trace/trace_selftest.c |  1 -
 5 files changed, 146 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 78a49e67f7db..d9a4aa02c384 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1331,61 +1331,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
 
 #endif /* CONFIG_STACKTRACE */
 
-static void
-ftrace_trace_special(void *__tr,
-		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
-		     int pc)
-{
-	struct ftrace_event_call *call = &event_special;
-	struct ring_buffer_event *event;
-	struct trace_array *tr = __tr;
-	struct ring_buffer *buffer = tr->buffer;
-	struct special_entry *entry;
-
-	event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
-					  sizeof(*entry), 0, pc);
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	entry->arg1			= arg1;
-	entry->arg2			= arg2;
-	entry->arg3			= arg3;
-
-	if (!filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(buffer, event, 0, pc);
-}
-
-void
-__trace_special(void *__tr, void *__data,
-		unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-	ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
-}
-
-void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	int cpu;
-	int pc;
-
-	if (tracing_disabled)
-		return;
-
-	pc = preempt_count();
-	local_irq_save(flags);
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-
-	if (likely(atomic_inc_return(&data->disabled) == 1))
-		ftrace_trace_special(tr, arg1, arg2, arg3, pc);
-
-	atomic_dec(&data->disabled);
-	local_irq_restore(flags);
-}
-
 /**
  * trace_vbprintk - write binary msg to tracing buffer
  *
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2114b4c1150f..638a5887e2ec 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,7 +22,6 @@ enum trace_type {
 	TRACE_STACK,
 	TRACE_PRINT,
 	TRACE_BPRINT,
-	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
 	TRACE_BRANCH,
@@ -189,7 +188,6 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
-		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
@@ -332,11 +330,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
 				struct task_struct *wakee,
 				struct task_struct *cur,
 				unsigned long flags, int pc);
-void trace_special(struct trace_array *tr,
-		   struct trace_array_cpu *data,
-		   unsigned long arg1,
-		   unsigned long arg2,
-		   unsigned long arg3, int pc);
 void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 84128371f254..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -150,23 +150,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
 		)
 );
 
-/*
- * Special (free-form) trace entry:
- */
-FTRACE_ENTRY(special, special_entry,
-
-	TRACE_SPECIAL,
-
-	F_STRUCT(
-		__field(	unsigned long,	arg1	)
-		__field(	unsigned long,	arg2	)
-		__field(	unsigned long,	arg3	)
-	),
-
-	F_printk("(%08lx) (%08lx) (%08lx)",
-		 __entry->arg1, __entry->arg2, __entry->arg3)
-);
-
 /*
  * Stack-trace entry:
  */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..a46197b80b7f 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1069,65 +1069,6 @@ static struct trace_event trace_wake_event = {
 	.funcs		= &trace_wake_funcs,
 };
 
-/* TRACE_SPECIAL */
-static enum print_line_t trace_special_print(struct trace_iterator *iter,
-					     int flags, struct trace_event *event)
-{
-	struct special_entry *field;
-
-	trace_assign_type(field, iter->ent);
-
-	if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
-			      field->arg1,
-			      field->arg2,
-			      field->arg3))
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t trace_special_hex(struct trace_iterator *iter,
-					   int flags, struct trace_event *event)
-{
-	struct special_entry *field;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(field, iter->ent);
-
-	SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
-	SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
-	SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t trace_special_bin(struct trace_iterator *iter,
-					   int flags, struct trace_event *event)
-{
-	struct special_entry *field;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(field, iter->ent);
-
-	SEQ_PUT_FIELD_RET(s, field->arg1);
-	SEQ_PUT_FIELD_RET(s, field->arg2);
-	SEQ_PUT_FIELD_RET(s, field->arg3);
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static struct trace_event_functions trace_special_funcs = {
-	.trace		= trace_special_print,
-	.raw		= trace_special_print,
-	.hex		= trace_special_hex,
-	.binary		= trace_special_bin,
-};
-
-static struct trace_event trace_special_event = {
-	.type		= TRACE_SPECIAL,
-	.funcs		= &trace_special_funcs,
-};
-
 /* TRACE_STACK */
 
 static enum print_line_t trace_stack_print(struct trace_iterator *iter,
@@ -1161,9 +1102,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 static struct trace_event_functions trace_stack_funcs = {
 	.trace		= trace_stack_print,
-	.raw		= trace_special_print,
-	.hex		= trace_special_hex,
-	.binary		= trace_special_bin,
 };
 
 static struct trace_event trace_stack_event = {
@@ -1194,9 +1132,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 static struct trace_event_functions trace_user_stack_funcs = {
 	.trace		= trace_user_stack_print,
-	.raw		= trace_special_print,
-	.hex		= trace_special_hex,
-	.binary		= trace_special_bin,
 };
 
 static struct trace_event trace_user_stack_event = {
@@ -1314,7 +1249,6 @@ static struct trace_event *events[] __initdata = {
 	&trace_fn_event,
 	&trace_ctx_event,
 	&trace_wake_event,
-	&trace_special_event,
 	&trace_stack_event,
 	&trace_user_stack_event,
 	&trace_bprint_event,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 6ed05ee6cbc7..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,7 +13,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_WAKE:
 	case TRACE_STACK:
 	case TRACE_PRINT:
-	case TRACE_SPECIAL:
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
-- 
cgit v1.2.3


From b444786f1a797a7f84e2561346a670649f9c7b3c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 7 Jul 2010 23:40:11 +0200
Subject: tracing: Use generic_file_llseek for debugfs

The default for llseek will change to no_llseek,
so the tracing debugfs files need to add explicit
.llseek assignments. Since we're dealing with regular
files from a VFS perspective, use generic_file_llseek.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: John Kacur <jkacur@redhat.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1278538820-1392-10-git-send-email-arnd@arndb.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d9a4aa02c384..c1752dac613e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2338,6 +2338,7 @@ static const struct file_operations show_traces_fops = {
 	.open		= show_traces_open,
 	.read		= seq_read,
 	.release	= seq_release,
+	.llseek		= seq_lseek,
 };
 
 /*
@@ -2431,6 +2432,7 @@ static const struct file_operations tracing_cpumask_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_cpumask_read,
 	.write		= tracing_cpumask_write,
+	.llseek		= generic_file_llseek,
 };
 
 static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2597,6 +2599,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
 static const struct file_operations tracing_readme_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_readme_read,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t
@@ -2647,6 +2650,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
 static const struct file_operations tracing_saved_cmdlines_fops = {
     .open       = tracing_open_generic,
     .read       = tracing_saved_cmdlines_read,
+    .llseek	= generic_file_llseek,
 };
 
 static ssize_t
@@ -2976,6 +2980,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	if (iter->trace->pipe_open)
 		iter->trace->pipe_open(iter);
 
+	nonseekable_open(inode, filp);
 out:
 	mutex_unlock(&trace_types_lock);
 	return ret;
@@ -3534,18 +3539,21 @@ static const struct file_operations tracing_max_lat_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_max_lat_read,
 	.write		= tracing_max_lat_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct file_operations tracing_ctrl_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_ctrl_read,
 	.write		= tracing_ctrl_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct file_operations set_tracer_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_set_trace_read,
 	.write		= tracing_set_trace_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct file_operations tracing_pipe_fops = {
@@ -3554,17 +3562,20 @@ static const struct file_operations tracing_pipe_fops = {
 	.read		= tracing_read_pipe,
 	.splice_read	= tracing_splice_read_pipe,
 	.release	= tracing_release_pipe,
+	.llseek		= no_llseek,
 };
 
 static const struct file_operations tracing_entries_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_entries_read,
 	.write		= tracing_entries_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct file_operations tracing_mark_fops = {
 	.open		= tracing_open_generic,
 	.write		= tracing_mark_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct file_operations trace_clock_fops = {
@@ -3870,6 +3881,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 static const struct file_operations tracing_stats_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_stats_read,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -3906,6 +3918,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
 static const struct file_operations tracing_dyn_info_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_read_dyn_info,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -4059,6 +4072,7 @@ static const struct file_operations trace_options_fops = {
 	.open = tracing_open_generic,
 	.read = trace_options_read,
 	.write = trace_options_write,
+	.llseek	= generic_file_llseek,
 };
 
 static ssize_t
@@ -4110,6 +4124,7 @@ static const struct file_operations trace_options_core_fops = {
 	.open = tracing_open_generic,
 	.read = trace_options_core_read,
 	.write = trace_options_core_write,
+	.llseek = generic_file_llseek,
 };
 
 struct dentry *trace_create_file(const char *name,
-- 
cgit v1.2.3


From e870e9a1240bcef1157ffaaf71dac63362e71904 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Jul 2010 11:07:32 +0800
Subject: tracing: Allow to disable cmdline recording

We found that even enabling a single trace event that will rarely be
triggered can add big overhead to context switch.

(lmbench context switch test)
 -------------------------------------------------
 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K
 ctxsw  ctxsw  ctxsw ctxsw  ctxsw   ctxsw   ctxsw
------ ------ ------ ------ ------ ------- -------
  2.19   2.3   2.21   2.56   2.13     2.54    2.07
  2.39   2.51  2.35   2.75   2.27     2.81    2.24

The overhead is 6% ~ 11%.

It's because when a trace event is enabled 3 tracepoints (sched_switch,
sched_wakeup, sched_wakeup_new) will be activated to map pid to cmdname.

We'd like to avoid this overhead, so add a trace option '(no)record-cmd'
to allow to disable cmdline recording.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4C2D57F4.2050204@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        |  6 +++++-
 kernel/trace/trace.h        |  3 +++
 kernel/trace/trace_events.c | 30 ++++++++++++++++++++++++++++--
 3 files changed, 36 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8683dec6946b..af9042977c08 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -344,7 +344,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
-	TRACE_ITER_GRAPH_TIME;
+	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
 
 static int trace_stop_count;
 static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +428,7 @@ static const char *trace_options[] = {
 	"latency-format",
 	"sleep-time",
 	"graph-time",
+	"record-cmd",
 	NULL
 };
 
@@ -2561,6 +2562,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
 		trace_flags |= mask;
 	else
 		trace_flags &= ~mask;
+
+	if (mask == TRACE_ITER_RECORD_CMD)
+		trace_event_enable_cmd_record(enabled);
 }
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 84d3f123e86f..7778f067fc8b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -591,6 +591,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_LATENCY_FMT		= 0x20000,
 	TRACE_ITER_SLEEP_TIME		= 0x40000,
 	TRACE_ITER_GRAPH_TIME		= 0x80000,
+	TRACE_ITER_RECORD_CMD		= 0x100000,
 };
 
 /*
@@ -723,6 +724,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
+extern void trace_event_enable_cmd_record(bool enable);
+
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e8e6043f4d29..09b4fa6e4d3b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -170,6 +170,26 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
 }
 EXPORT_SYMBOL_GPL(ftrace_event_reg);
 
+void trace_event_enable_cmd_record(bool enable)
+{
+	struct ftrace_event_call *call;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (!(call->flags & TRACE_EVENT_FL_ENABLED))
+			continue;
+
+		if (enable) {
+			tracing_start_cmdline_record();
+			call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+		} else {
+			tracing_stop_cmdline_record();
+			call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+		}
+	}
+	mutex_unlock(&event_mutex);
+}
+
 static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 {
@@ -179,13 +199,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 	case 0:
 		if (call->flags & TRACE_EVENT_FL_ENABLED) {
 			call->flags &= ~TRACE_EVENT_FL_ENABLED;
-			tracing_stop_cmdline_record();
+			if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
+				tracing_stop_cmdline_record();
+				call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+			}
 			call->class->reg(call, TRACE_REG_UNREGISTER);
 		}
 		break;
 	case 1:
 		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
-			tracing_start_cmdline_record();
+			if (trace_flags & TRACE_ITER_RECORD_CMD) {
+				tracing_start_cmdline_record();
+				call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+			}
 			ret = call->class->reg(call, TRACE_REG_REGISTER);
 			if (ret) {
 				tracing_stop_cmdline_record();
-- 
cgit v1.2.3


From 985023dee6e212493831431ba2e3ce8918f001b2 Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Thu, 25 Mar 2010 11:27:36 +0000
Subject: trace: Reorder struct ring_buffer_per_cpu to remove padding on 64bit

Reorder structure to remove 8 bytes of padding on 64 bit builds.
This shrinks the size to 128 bytes so allowing allocation from a smaller
slab & needed one fewer cache lines.

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
LKML-Reference: <1269516456.2054.8.camel@localhost>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28d0615a513f..3632ce87674f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
  */
 struct ring_buffer_per_cpu {
 	int				cpu;
+	atomic_t			record_disabled;
 	struct ring_buffer		*buffer;
 	spinlock_t			reader_lock;	/* serialize readers */
 	arch_spinlock_t			lock;
@@ -462,7 +463,6 @@ struct ring_buffer_per_cpu {
 	unsigned long			read;
 	u64				write_stamp;
 	u64				read_stamp;
-	atomic_t			record_disabled;
 };
 
 struct ring_buffer {
-- 
cgit v1.2.3


From bc289ae98b75d93228d24f521ef02a076e506e94 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 3 Jun 2010 18:26:24 +0800
Subject: tracing: Reduce latency and remove percpu trace_seq

__print_flags() and __print_symbolic() use percpu trace_seq:

1) Its memory is allocated at compile time, it wastes memory if we don't use tracing.
2) It is percpu data and it wastes more memory for multi-cpus system.
3) It disables preemption when it executes its core routine
   "trace_seq_printf(s, "%s: ", #call);" and introduces latency.

So we move this trace_seq to struct trace_iterator.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4C078350.7090106@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..1ba64d3cc567 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
 
 DECLARE_RWSEM(trace_event_mutex);
 
-DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
-EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
-
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
-- 
cgit v1.2.3


From ef710e100c1068d3dd5774d2b34c5485219e06ce Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 1 Jul 2010 14:34:35 +0900
Subject: tracing: Shrink max latency ringbuffer if unnecessary

Documentation/trace/ftrace.txt says

  buffer_size_kb:

        This sets or displays the number of kilobytes each CPU
        buffer can hold. The tracer buffers are the same size
        for each CPU. The displayed number is the size of the
        CPU buffer and not total size of all buffers. The
        trace buffers are allocated in pages (blocks of memory
        that the kernel uses for allocation, usually 4 KB in size).
        If the last page allocated has room for more bytes
        than requested, the rest of the page will be used,
        making the actual allocation bigger than requested.
        ( Note, the size may not be a multiple of the page size
          due to buffer management overhead. )

        This can only be updated when the current_tracer
        is set to "nop".

But it's incorrect. currently total memory consumption is
'buffer_size_kb x CPUs x 2'.

Why two times difference is there? because ftrace implicitly allocate
the buffer for max latency too.

That makes sad result when admin want to use large buffer. (If admin
want full logging and makes detail analysis). example, If admin
have 24 CPUs machine and write 200MB to buffer_size_kb, the system
consume ~10GB memory (200MB x 24 x 2). umm.. 5GB memory waste is
usually unacceptable.

Fortunatelly, almost all users don't use max latency feature.
The max latency buffer can be disabled easily.

This patch shrink buffer size of the max latency buffer if
unnecessary.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
LKML-Reference: <20100701104554.DA2D.A69D9226@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c              | 38 ++++++++++++++++++++++++++++++++------
 kernel/trace/trace.h              |  1 +
 kernel/trace/trace_irqsoff.c      |  3 +++
 kernel/trace/trace_sched_wakeup.c |  2 ++
 4 files changed, 38 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index af9042977c08..f7488f44d26b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -660,6 +660,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
+	if (!current_trace->use_max_tr) {
+		WARN_ON_ONCE(1);
+		return;
+	}
 	arch_spin_lock(&ftrace_max_lock);
 
 	tr->buffer = max_tr.buffer;
@@ -686,6 +690,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
+	if (!current_trace->use_max_tr) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
 	arch_spin_lock(&ftrace_max_lock);
 
 	ftrace_disable_cpu();
@@ -2801,6 +2810,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
 	if (ret < 0)
 		return ret;
 
+	if (!current_trace->use_max_tr)
+		goto out;
+
 	ret = ring_buffer_resize(max_tr.buffer, size);
 	if (ret < 0) {
 		int r;
@@ -2828,11 +2840,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
 		return ret;
 	}
 
+	max_tr.entries = size;
+ out:
 	global_trace.entries = size;
 
 	return ret;
 }
 
+
 /**
  * tracing_update_buffers - used by tracing facility to expand ring buffers
  *
@@ -2893,12 +2908,26 @@ static int tracing_set_tracer(const char *buf)
 	trace_branch_disable();
 	if (current_trace && current_trace->reset)
 		current_trace->reset(tr);
-
+	if (current_trace && current_trace->use_max_tr) {
+		/*
+		 * We don't free the ring buffer. instead, resize it because
+		 * The max_tr ring buffer has some state (e.g. ring->clock) and
+		 * we want preserve it.
+		 */
+		ring_buffer_resize(max_tr.buffer, 1);
+		max_tr.entries = 1;
+	}
 	destroy_trace_option_files(topts);
 
 	current_trace = t;
 
 	topts = create_trace_option_files(current_trace);
+	if (current_trace->use_max_tr) {
+		ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
+		if (ret < 0)
+			goto out;
+		max_tr.entries = global_trace.entries;
+	}
 
 	if (t->init) {
 		ret = tracer_init(t, tr);
@@ -3480,7 +3509,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	}
 
 	tracing_start();
-	max_tr.entries = global_trace.entries;
 	mutex_unlock(&trace_types_lock);
 
 	return cnt;
@@ -4578,16 +4606,14 @@ __init static int tracer_alloc_buffers(void)
 
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	max_tr.buffer = ring_buffer_alloc(ring_buf_size,
-					     TRACE_BUFFER_FLAGS);
+	max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
 	if (!max_tr.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
 		WARN_ON(1);
 		ring_buffer_free(global_trace.buffer);
 		goto out_free_cpumask;
 	}
-	max_tr.entries = ring_buffer_size(max_tr.buffer);
-	WARN_ON(max_tr.entries != global_trace.entries);
+	max_tr.entries = 1;
 #endif
 
 	/* Allocate the first page for all buffers */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7778f067fc8b..cb629b3b108c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -276,6 +276,7 @@ struct tracer {
 	struct tracer		*next;
 	int			print_max;
 	struct tracer_flags	*flags;
+	int			use_max_tr;
 };
 
 
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6fd486e0cef4..73a6b0601f2e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly =
 #endif
 	.open           = irqsoff_trace_open,
 	.close          = irqsoff_trace_close,
+	.use_max_tr	= 1,
 };
 # define register_irqsoff(trace) register_tracer(&trace)
 #else
@@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly =
 #endif
 	.open		= irqsoff_trace_open,
 	.close		= irqsoff_trace_close,
+	.use_max_tr	= 1,
 };
 # define register_preemptoff(trace) register_tracer(&trace)
 #else
@@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 #endif
 	.open		= irqsoff_trace_open,
 	.close		= irqsoff_trace_close,
+	.use_max_tr	= 1,
 };
 
 # define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index c9fd5bd02036..4086eae6e81b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -382,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly =
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
+	.use_max_tr	= 1,
 };
 
 static struct tracer wakeup_rt_tracer __read_mostly =
@@ -396,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
+	.use_max_tr	= 1,
 };
 
 __init static int init_wakeup_tracer(void)
-- 
cgit v1.2.3


From 24a461d537f49f9da6533d83100999ea08c6c755 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 10 Jul 2010 12:06:44 +0200
Subject: trace: strlen() return doesn't account for the NULL

We need to add one to the strlen() return because of the NULL
character.  The type->name here generally comes from the kernel and I
don't think any of them come close to being MAX_TRACER_SIZE (100)
characters long so this is basically a cleanup.

Signed-off-by: Dan Carpenter <error27@gmail.com>
LKML-Reference: <20100710100644.GV19184@bicker>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f7488f44d26b..cacb6f083ecb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -739,7 +739,7 @@ __acquires(kernel_lock)
 		return -1;
 	}
 
-	if (strlen(type->name) > MAX_TRACER_SIZE) {
+	if (strlen(type->name) >= MAX_TRACER_SIZE) {
 		pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
 		return -1;
 	}
-- 
cgit v1.2.3


From 592913ecb87a9e06f98ddb55b298f1a66bf94c6b Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:20 -0700
Subject: time: Kill off CONFIG_GENERIC_TIME

Now that all arches have been converted over to use generic time via
clocksources or arch_gettimeoffset(), we can remove the GENERIC_TIME
config option and simplify the generic code.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1279068988-21864-4-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..7531ddaf3afe 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -153,7 +153,7 @@ config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
 	depends on TRACE_IRQFLAGS_SUPPORT
-	depends on GENERIC_TIME
+	depends on !ARCH_USES_GETTIMEOFFSET
 	select TRACE_IRQFLAGS
 	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
@@ -175,7 +175,7 @@ config IRQSOFF_TRACER
 config PREEMPT_TRACER
 	bool "Preemption-off Latency Tracer"
 	default n
-	depends on GENERIC_TIME
+	depends on !ARCH_USES_GETTIMEOFFSET
 	depends on PREEMPT
 	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
-- 
cgit v1.2.3


From 669336e4cf3e1cb95800f3f5924558a76d723c21 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 20 Jul 2010 17:29:54 +0200
Subject: perf: Use tracepoint_synchronize_unregister() to flush any pending
 tracepoint call

We use synchronize_sched() to ensure a tracepoint won't be called
while/after we release the perf buffers it references.

But the tracepoint API has its own API for that:
tracepoint_synchronize_unregister(). Use it instead as it's
self-explanatory and eases maintainance.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace_event_perf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 23751659582e..000e6e85b445 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -131,10 +131,10 @@ void perf_trace_destroy(struct perf_event *p_event)
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
 
 	/*
-	 * Ensure our callback won't be called anymore. See
-	 * tracepoint_probe_unregister() and __DO_TRACE().
+	 * Ensure our callback won't be called anymore. The buffers
+	 * will be freed after that.
 	 */
-	synchronize_sched();
+	tracepoint_synchronize_unregister();
 
 	free_percpu(tp_event->perf_events);
 	tp_event->perf_events = NULL;
-- 
cgit v1.2.3


From 9da79ab83ee33ddc1fdd0858fd3d70925a1bde99 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Wed, 30 Jun 2010 14:15:48 +0530
Subject: tracing/kprobes: unregister_trace_probe needs to be called under
 mutex

Comment in unregister_trace_probe() says probe_lock will be held when it
gets called. However there is a case where it might called without the
probe_lock being held. Also since we are traversing the probe_list and
deleting an element from the probe_list, probe_lock should be held.

This was first pointed in uprobes traceevent review by Frederic
Weisbecker here.  (http://lkml.org/lkml/2010/5/12/106)

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20100630084548.GA10325@linux.vnet.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/trace/trace_kprobe.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1b79d1c15726..8b27c9849b42 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -925,14 +925,17 @@ static int create_trace_probe(int argc, char **argv)
 			pr_info("Delete command needs an event name.\n");
 			return -EINVAL;
 		}
+		mutex_lock(&probe_lock);
 		tp = find_probe_event(event, group);
 		if (!tp) {
+			mutex_unlock(&probe_lock);
 			pr_info("Event %s/%s doesn't exist.\n", group, event);
 			return -ENOENT;
 		}
 		/* delete an event */
 		unregister_trace_probe(tp);
 		free_trace_probe(tp);
+		mutex_unlock(&probe_lock);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 955b61e597984745fb7d34c75708f6503b6aaeab Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 5 Aug 2010 09:22:23 -0500
Subject: ftrace,kdb: Extend kdb to be able to dump the ftrace buffer

Add in a helper function to allow the kdb shell to dump the ftrace
buffer.

Modify trace.c to expose the capability to iterate over the ftrace
buffer in a read only capacity.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
CC: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/Makefile    |   3 ++
 kernel/trace/trace.c     |  43 ++++++++---------
 kernel/trace/trace.h     |  19 ++++++++
 kernel/trace/trace_kdb.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 163 insertions(+), 21 deletions(-)
 create mode 100644 kernel/trace/trace_kdb.c

(limited to 'kernel/trace')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..4215530b490b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -57,5 +57,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+ifeq ($(CONFIG_TRACING),y)
+obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
+endif
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..d6736b93dc2a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
 	preempt_enable();
 }
 
-static cpumask_var_t __read_mostly	tracing_buffer_mask;
-
-#define for_each_tracing_cpu(cpu)	\
-	for_each_cpu(cpu, tracing_buffer_mask)
+cpumask_var_t __read_mostly	tracing_buffer_mask;
 
 /*
  * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -1539,11 +1536,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 }
 EXPORT_SYMBOL_GPL(trace_vprintk);
 
-enum trace_file_type {
-	TRACE_FILE_LAT_FMT	= 1,
-	TRACE_FILE_ANNOTATE	= 2,
-};
-
 static void trace_iterator_increment(struct trace_iterator *iter)
 {
 	/* Don't allow ftrace to trace into the ring buffers */
@@ -1641,7 +1633,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 }
 
 /* Find the next real entry, and increment the iterator to the next entry */
-static void *find_next_entry_inc(struct trace_iterator *iter)
+void *trace_find_next_entry_inc(struct trace_iterator *iter)
 {
 	iter->ent = __find_next_entry(iter, &iter->cpu,
 				      &iter->lost_events, &iter->ts);
@@ -1676,19 +1668,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 		return NULL;
 
 	if (iter->idx < 0)
-		ent = find_next_entry_inc(iter);
+		ent = trace_find_next_entry_inc(iter);
 	else
 		ent = iter;
 
 	while (ent && iter->idx < i)
-		ent = find_next_entry_inc(iter);
+		ent = trace_find_next_entry_inc(iter);
 
 	iter->pos = *pos;
 
 	return ent;
 }
 
-static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
+void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 {
 	struct trace_array *tr = iter->tr;
 	struct ring_buffer_event *event;
@@ -2049,7 +2041,7 @@ int trace_empty(struct trace_iterator *iter)
 }
 
 /*  Called with trace_event_read_lock() held. */
-static enum print_line_t print_trace_line(struct trace_iterator *iter)
+enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 	enum print_line_t ret;
 
@@ -3211,7 +3203,7 @@ waitagain:
 
 	trace_event_read_lock();
 	trace_access_lock(iter->cpu_file);
-	while (find_next_entry_inc(iter) != NULL) {
+	while (trace_find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		int len = iter->seq.len;
 
@@ -3294,7 +3286,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
 		if (ret != TRACE_TYPE_NO_CONSUME)
 			trace_consume(iter);
 		rem -= count;
-		if (!find_next_entry_inc(iter))	{
+		if (!trace_find_next_entry_inc(iter))	{
 			rem = 0;
 			iter->ent = NULL;
 			break;
@@ -3350,7 +3342,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	if (ret <= 0)
 		goto out_err;
 
-	if (!iter->ent && !find_next_entry_inc(iter)) {
+	if (!iter->ent && !trace_find_next_entry_inc(iter)) {
 		ret = -EFAULT;
 		goto out_err;
 	}
@@ -4414,7 +4406,7 @@ static struct notifier_block trace_die_notifier = {
  */
 #define KERN_TRACE		KERN_EMERG
 
-static void
+void
 trace_printk_seq(struct trace_seq *s)
 {
 	/* Probably should print a warning here. */
@@ -4429,6 +4421,13 @@ trace_printk_seq(struct trace_seq *s)
 	trace_seq_init(s);
 }
 
+void trace_init_global_iter(struct trace_iterator *iter)
+{
+	iter->tr = &global_trace;
+	iter->trace = current_trace;
+	iter->cpu_file = TRACE_PIPE_ALL_CPU;
+}
+
 static void
 __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 {
@@ -4454,8 +4453,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 	if (disable_tracing)
 		ftrace_kill();
 
+	trace_init_global_iter(&iter);
+
 	for_each_tracing_cpu(cpu) {
-		atomic_inc(&global_trace.data[cpu]->disabled);
+		atomic_inc(&iter.tr->data[cpu]->disabled);
 	}
 
 	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4504,7 +4505,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 		iter.iter_flags |= TRACE_FILE_LAT_FMT;
 		iter.pos = -1;
 
-		if (find_next_entry_inc(&iter) != NULL) {
+		if (trace_find_next_entry_inc(&iter) != NULL) {
 			int ret;
 
 			ret = print_trace_line(&iter);
@@ -4526,7 +4527,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 		trace_flags |= old_userobj;
 
 		for_each_tracing_cpu(cpu) {
-			atomic_dec(&global_trace.data[cpu]->disabled);
+			atomic_dec(&iter.tr->data[cpu]->disabled);
 		}
 		tracing_on();
 	}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..0605fc00c176 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -338,6 +338,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					  int *ent_cpu, u64 *ent_ts);
 
+int trace_empty(struct trace_iterator *iter);
+
+void *trace_find_next_entry_inc(struct trace_iterator *iter);
+
+void trace_init_global_iter(struct trace_iterator *iter);
+
+void tracing_iter_reset(struct trace_iterator *iter, int cpu);
+
 void default_wait_pipe(struct trace_iterator *iter);
 void poll_wait_pipe(struct trace_iterator *iter);
 
@@ -380,6 +388,15 @@ void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 int is_tracing_stopped(void);
+enum trace_file_type {
+	TRACE_FILE_LAT_FMT	= 1,
+	TRACE_FILE_ANNOTATE	= 2,
+};
+
+extern cpumask_var_t __read_mostly tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu)	\
+	for_each_cpu(cpu, tracing_buffer_mask)
 
 extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
 
@@ -471,6 +488,8 @@ trace_array_vprintk(struct trace_array *tr,
 		    unsigned long ip, const char *fmt, va_list args);
 int trace_array_printk(struct trace_array *tr,
 		       unsigned long ip, const char *fmt, ...);
+void trace_printk_seq(struct trace_seq *s);
+enum print_line_t print_trace_line(struct trace_iterator *iter);
 
 extern unsigned long trace_flags;
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..44cbda25b0a5
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,119 @@
+/*
+ * kdb helper for dumping the ftrace buffer
+ *
+ * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
+ *
+ * ftrace_dump_buf based on ftrace_dump:
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ */
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/ftrace.h>
+
+#include "../debug/kdb/kdb_private.h"
+#include "trace.h"
+#include "trace_output.h"
+
+static void ftrace_dump_buf(int skip_lines)
+{
+	/* use static because iter can be a bit big for the stack */
+	static struct trace_iterator iter;
+	unsigned int old_userobj;
+	int cnt = 0, cpu;
+
+	trace_init_global_iter(&iter);
+
+	for_each_tracing_cpu(cpu) {
+		atomic_inc(&iter.tr->data[cpu]->disabled);
+	}
+
+	old_userobj = trace_flags;
+
+	/* don't look at user memory in panic mode */
+	trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
+	kdb_printf("Dumping ftrace buffer:\n");
+
+	/* reset all but tr, trace, and overruns */
+	memset(&iter.seq, 0,
+		   sizeof(struct trace_iterator) -
+		   offsetof(struct trace_iterator, seq));
+	iter.iter_flags |= TRACE_FILE_LAT_FMT;
+	iter.pos = -1;
+
+	for_each_tracing_cpu(cpu) {
+		iter.buffer_iter[cpu] =
+			ring_buffer_read_prepare(iter.tr->buffer, cpu);
+		ring_buffer_read_start(iter.buffer_iter[cpu]);
+		tracing_iter_reset(&iter, cpu);
+	}
+
+	if (!trace_empty(&iter))
+		trace_find_next_entry_inc(&iter);
+	while (!trace_empty(&iter)) {
+		if (!cnt)
+			kdb_printf("---------------------------------\n");
+		cnt++;
+
+		if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
+			print_trace_line(&iter);
+		if (!skip_lines)
+			trace_printk_seq(&iter.seq);
+		else
+			skip_lines--;
+		if (KDB_FLAG(CMD_INTERRUPT))
+			goto out;
+	}
+
+	if (!cnt)
+		kdb_printf("   (ftrace buffer empty)\n");
+	else
+		kdb_printf("---------------------------------\n");
+
+out:
+	trace_flags = old_userobj;
+
+	for_each_tracing_cpu(cpu) {
+		atomic_dec(&iter.tr->data[cpu]->disabled);
+	}
+
+	for_each_tracing_cpu(cpu)
+		if (iter.buffer_iter[cpu])
+			ring_buffer_read_finish(iter.buffer_iter[cpu]);
+}
+
+/*
+ * kdb_ftdump - Dump the ftrace log buffer
+ */
+static int kdb_ftdump(int argc, const char **argv)
+{
+	int skip_lines = 0;
+	char *cp;
+
+	if (argc > 1)
+		return KDB_ARGCOUNT;
+
+	if (argc) {
+		skip_lines = simple_strtol(argv[1], &cp, 0);
+		if (*cp)
+			skip_lines = 0;
+	}
+
+	kdb_trap_printk++;
+	ftrace_dump_buf(skip_lines);
+	kdb_trap_printk--;
+
+	return 0;
+}
+
+static __init int kdb_ftrace_register(void)
+{
+	kdb_register_repeat("ftdump", kdb_ftdump, "", "Dump ftrace log",
+			    0, KDB_REPEAT_NONE);
+	return 0;
+}
+
+late_initcall(kdb_ftrace_register);
-- 
cgit v1.2.3


From 19063c776fe745fab11216422cf56489ee83b452 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 5 Aug 2010 09:22:23 -0500
Subject: ftrace,kdb: Allow dumping a specific cpu's buffer with ftdump

In systems with more than one processor it is desirable to look at the
per cpu trace buffers.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
CC: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kdb.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 44cbda25b0a5..7b8ecd751d93 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -17,7 +17,7 @@
 #include "trace.h"
 #include "trace_output.h"
 
-static void ftrace_dump_buf(int skip_lines)
+static void ftrace_dump_buf(int skip_lines, long cpu_file)
 {
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
@@ -44,13 +44,20 @@ static void ftrace_dump_buf(int skip_lines)
 	iter.iter_flags |= TRACE_FILE_LAT_FMT;
 	iter.pos = -1;
 
-	for_each_tracing_cpu(cpu) {
-		iter.buffer_iter[cpu] =
+	if (cpu_file == TRACE_PIPE_ALL_CPU) {
+		for_each_tracing_cpu(cpu) {
+			iter.buffer_iter[cpu] =
 			ring_buffer_read_prepare(iter.tr->buffer, cpu);
-		ring_buffer_read_start(iter.buffer_iter[cpu]);
-		tracing_iter_reset(&iter, cpu);
+			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			tracing_iter_reset(&iter, cpu);
+		}
+	} else {
+		iter.cpu_file = cpu_file;
+		iter.buffer_iter[cpu_file] =
+			ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
+		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
+		tracing_iter_reset(&iter, cpu_file);
 	}
-
 	if (!trace_empty(&iter))
 		trace_find_next_entry_inc(&iter);
 	while (!trace_empty(&iter)) {
@@ -91,9 +98,10 @@ out:
 static int kdb_ftdump(int argc, const char **argv)
 {
 	int skip_lines = 0;
+	long cpu_file;
 	char *cp;
 
-	if (argc > 1)
+	if (argc > 2)
 		return KDB_ARGCOUNT;
 
 	if (argc) {
@@ -102,8 +110,17 @@ static int kdb_ftdump(int argc, const char **argv)
 			skip_lines = 0;
 	}
 
+	if (argc == 2) {
+		cpu_file = simple_strtol(argv[2], &cp, 0);
+		if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
+		    !cpu_online(cpu_file))
+			return KDB_BADINT;
+	} else {
+		cpu_file = TRACE_PIPE_ALL_CPU;
+	}
+
 	kdb_trap_printk++;
-	ftrace_dump_buf(skip_lines);
+	ftrace_dump_buf(skip_lines, cpu_file);
 	kdb_trap_printk--;
 
 	return 0;
@@ -111,8 +128,8 @@ static int kdb_ftdump(int argc, const char **argv)
 
 static __init int kdb_ftrace_register(void)
 {
-	kdb_register_repeat("ftdump", kdb_ftdump, "", "Dump ftrace log",
-			    0, KDB_REPEAT_NONE);
+	kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
+			    "Dump ftrace log", 0, KDB_REPEAT_NONE);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 575570f02761bd680ba5731c1dfd4701062e7fb2 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 27 Jul 2010 16:06:34 +0800
Subject: tracing: Fix an unallocated memory access in function_graph

With CONFIG_DEBUG_PAGEALLOC, I observed an unallocated memory access in
function_graph trace. It appears we find a small size entry in ring buffer,
but we access it as a big size entry. The access overflows the page size
and touches an unallocated page.

Cc: <stable@kernel.org>
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
LKML-Reference: <1280217994.32400.76.camel@sli10-desk.sh.intel.com>
[ Added a comment to explain the problem - SDR ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..b4c179ae4e45 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -507,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter,
 			 * if the output fails.
 			 */
 			data->ent = *curr;
-			data->ret = *next;
+			/*
+			 * If the next event is not a return type, then
+			 * we only care about what type it is. Otherwise we can
+			 * safely copy the entire event.
+			 */
+			if (next->ent.type == TRACE_GRAPH_RET)
+				data->ret = *next;
+			else
+				data->ret.ent.type = next->ent.type;
 		}
 	}
 
-- 
cgit v1.2.3


From 18fab912d4fa70133df164d2dcf3310be0c38c34 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 28 Jul 2010 14:14:01 +0800
Subject: tracing: Fix ring_buffer_read_page reading out of page boundary

With the configuration: CONFIG_DEBUG_PAGEALLOC=y and Shaohua's patch:

[PATCH]x86: make spurious_fault check correct pte bit

Function call graph trace with the following will trigger a page fault.

# cd /sys/kernel/debug/tracing/
# echo function_graph > current_tracer
# cat per_cpu/cpu1/trace_pipe_raw > /dev/null

BUG: unable to handle kernel paging request at ffff880006e99000
IP: [<ffffffff81085572>] rb_event_length+0x1/0x3f
PGD 1b19063 PUD 1b1d063 PMD 3f067 PTE 6e99160
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
last sysfs file: /sys/devices/virtual/net/lo/operstate
CPU 1
Modules linked in:

Pid: 1982, comm: cat Not tainted 2.6.35-rc6-aes+ #300 /Bochs
RIP: 0010:[<ffffffff81085572>]  [<ffffffff81085572>] rb_event_length+0x1/0x3f
RSP: 0018:ffff880006475e38  EFLAGS: 00010006
RAX: 0000000000000ff0 RBX: ffff88000786c630 RCX: 000000000000001d
RDX: ffff880006e98000 RSI: 0000000000000ff0 RDI: ffff880006e99000
RBP: ffff880006475eb8 R08: 000000145d7008bd R09: 0000000000000000
R10: 0000000000008000 R11: ffffffff815d9336 R12: ffff880006d08000
R13: ffff880006e605d8 R14: 0000000000000000 R15: 0000000000000018
FS:  00007f2b83e456f0(0000) GS:ffff880002100000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffff880006e99000 CR3: 00000000064a8000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process cat (pid: 1982, threadinfo ffff880006474000, task ffff880006e40770)
Stack:
 ffff880006475eb8 ffffffff8108730f 0000000000000ff0 000000145d7008bd
<0> ffff880006e98010 ffff880006d08010 0000000000000296 ffff88000786c640
<0> ffffffff81002956 0000000000000000 ffff8800071f4680 ffff8800071f4680
Call Trace:
 [<ffffffff8108730f>] ? ring_buffer_read_page+0x15a/0x24a
 [<ffffffff81002956>] ? return_to_handler+0x15/0x2f
 [<ffffffff8108a575>] tracing_buffers_read+0xb9/0x164
 [<ffffffff810debfe>] vfs_read+0xaf/0x150
 [<ffffffff81002941>] return_to_handler+0x0/0x2f
 [<ffffffff810248b0>] __bad_area_nosemaphore+0x17e/0x1a1
 [<ffffffff81002941>] return_to_handler+0x0/0x2f
 [<ffffffff810248e6>] bad_area_nosemaphore+0x13/0x15
Code: 80 25 b2 16 b3 00 fe c9 c3 55 48 89 e5 f0 80 0d a4 16 b3 00 02 c9 c3 55 31 c0 48 89 e5 48 83 3d 94 16 b3 00 01 c9 0f 94 c0 c3 55 <8a> 0f 48 89 e5 83 e1 1f b8 08 00 00 00 0f b6 d1 83 fa 1e 74 27
RIP  [<ffffffff81085572>] rb_event_length+0x1/0x3f
 RSP <ffff880006475e38>
CR2: ffff880006e99000
---[ end trace a6877bb92ccb36bb ]---

The root cause is that ring_buffer_read_page() may read out of page
boundary, because the boundary checking is done after reading. This is
fixed via doing boundary checking before reading.

Reported-by: Shaohua Li <shaohua.li@intel.com>
Cc: <stable@kernel.org>
Signed-off-by: Huang Ying <ying.huang@intel.com>
LKML-Reference: <1280297641.2771.307.camel@yhuang-dev>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1da7b6ea8b85..5ec8f1d1480e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3868,6 +3868,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 			rpos = reader->read;
 			pos += size;
 
+			if (rpos >= commit)
+				break;
+
 			event = rb_reader_event(cpu_buffer);
 			size = rb_event_length(event);
 		} while (len > size);
-- 
cgit v1.2.3


From 33659ebbae262228eef4e0fe990f393d1f0ed941 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 7 Aug 2010 18:17:56 +0200
Subject: block: remove wrappers for request type/flags

Remove all the trivial wrappers for the cmd_type and cmd_flags fields in
struct requests.  This allows much easier grepping for different request
types instead of unwinding through macros.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 638711c17504..4f149944cb89 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -661,10 +661,10 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (likely(!bt))
 		return;
 
-	if (blk_discard_rq(rq))
+	if (rq->cmd_flags & REQ_DISCARD)
 		rw |= (1 << BIO_RW_DISCARD);
 
-	if (blk_pc_request(rq)) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
 		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
 				what, rq->errors, rq->cmd_len, rq->cmd);
@@ -925,7 +925,7 @@ void blk_add_driver_data(struct request_queue *q,
 	if (likely(!bt))
 		return;
 
-	if (blk_pc_request(rq))
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
 		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
 				BLK_TA_DRV_DATA, rq->errors, len, data);
 	else
@@ -1730,7 +1730,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
 	int len = rq->cmd_len;
 	unsigned char *cmd = rq->cmd;
 
-	if (!blk_pc_request(rq)) {
+	if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
 		buf[0] = '\0';
 		return;
 	}
@@ -1779,7 +1779,7 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
 	int rw = rq->cmd_flags & 0x03;
 	int bytes;
 
-	if (blk_discard_rq(rq))
+	if (rq->cmd_flags & REQ_DISCARD)
 		rw |= (1 << BIO_RW_DISCARD);
 
 	bytes = blk_rq_bytes(rq);
-- 
cgit v1.2.3


From 7b6d91daee5cac6402186ff224c3af39d79f4a0e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 7 Aug 2010 18:20:39 +0200
Subject: block: unify flags for struct bio and struct request

Remove the current bio flags and reuse the request flags for the bio, too.
This allows to more easily trace the type of I/O from the filesystem
down to the block driver.  There were two flags in the bio that were
missing in the requests:  BIO_RW_UNPLUG and BIO_RW_AHEAD.  Also I've
renamed two request flags that had a superflous RW in them.

Note that the flags are in bio.h despite having the REQ_ name - as
blkdev.h includes bio.h that is the only way to go for now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 4f149944cb89..3b4a695051b6 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
 				 BLK_TC_ACT(BLK_TC_WRITE) };
 
+#define BLK_TC_HARDBARRIER	BLK_TC_BARRIER
+#define BLK_TC_RAHEAD		BLK_TC_AHEAD
+
 /* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
-	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
+#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
 
 /*
  * The worker for the various blk_add_trace*() types. Fills out a
@@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		return;
 
 	what |= ddir_act[rw & WRITE];
-	what |= MASK_TC_BIT(rw, BARRIER);
-	what |= MASK_TC_BIT(rw, SYNCIO);
-	what |= MASK_TC_BIT(rw, AHEAD);
+	what |= MASK_TC_BIT(rw, HARDBARRIER);
+	what |= MASK_TC_BIT(rw, SYNC);
+	what |= MASK_TC_BIT(rw, RAHEAD);
 	what |= MASK_TC_BIT(rw, META);
 	what |= MASK_TC_BIT(rw, DISCARD);
 
@@ -662,7 +665,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 		return;
 
 	if (rq->cmd_flags & REQ_DISCARD)
-		rw |= (1 << BIO_RW_DISCARD);
+		rw |= REQ_DISCARD;
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
@@ -1755,20 +1758,20 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 
 	if (rw & WRITE)
 		rwbs[i++] = 'W';
-	else if (rw & 1 << BIO_RW_DISCARD)
+	else if (rw & REQ_DISCARD)
 		rwbs[i++] = 'D';
 	else if (bytes)
 		rwbs[i++] = 'R';
 	else
 		rwbs[i++] = 'N';
 
-	if (rw & 1 << BIO_RW_AHEAD)
+	if (rw & REQ_RAHEAD)
 		rwbs[i++] = 'A';
-	if (rw & 1 << BIO_RW_BARRIER)
+	if (rw & REQ_HARDBARRIER)
 		rwbs[i++] = 'B';
-	if (rw & 1 << BIO_RW_SYNCIO)
+	if (rw & REQ_SYNC)
 		rwbs[i++] = 'S';
-	if (rw & 1 << BIO_RW_META)
+	if (rw & REQ_META)
 		rwbs[i++] = 'M';
 
 	rwbs[i] = '\0';
@@ -1780,7 +1783,7 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
 	int bytes;
 
 	if (rq->cmd_flags & REQ_DISCARD)
-		rw |= (1 << BIO_RW_DISCARD);
+		rw |= REQ_DISCARD;
 
 	bytes = blk_rq_bytes(rq);
 
-- 
cgit v1.2.3


From 62c2a7d969f30163f733c81158254b3095b23e72 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 7 Jul 2010 16:51:26 +0200
Subject: block: push BKL into blktrace ioctls

The blktrace driver currently needs the BKL, but
we should not need to take that in the block layer,
so just push it down into the driver itself.

It is quite likely that the BKL is not actually
required in blktrace code and could be removed
in a follow-on patch.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3b4a695051b6..82499a5bdcb7 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -552,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 }
 EXPORT_SYMBOL_GPL(blk_trace_setup);
 
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+static int compat_blk_trace_setup(struct request_queue *q, char *name,
+				  dev_t dev, struct block_device *bdev,
+				  char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	struct compat_blk_user_trace_setup cbuts;
+	int ret;
+
+	if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
+		return -EFAULT;
+
+	buts = (struct blk_user_trace_setup) {
+		.act_mask = cbuts.act_mask,
+		.buf_size = cbuts.buf_size,
+		.buf_nr = cbuts.buf_nr,
+		.start_lba = cbuts.start_lba,
+		.end_lba = cbuts.end_lba,
+		.pid = cbuts.pid,
+	};
+	memcpy(&buts.name, &cbuts.name, 32);
+
+	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(arg, &buts.name, 32)) {
+		blk_trace_remove(q);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+#endif
+
 int blk_trace_startstop(struct request_queue *q, int start)
 {
 	int ret;
@@ -604,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	if (!q)
 		return -ENXIO;
 
+	lock_kernel();
 	mutex_lock(&bdev->bd_mutex);
 
 	switch (cmd) {
@@ -611,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 		bdevname(bdev, b);
 		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+	case BLKTRACESETUP32:
+		bdevname(bdev, b);
+		ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+		break;
+#endif
 	case BLKTRACESTART:
 		start = 1;
 	case BLKTRACESTOP:
@@ -625,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	}
 
 	mutex_unlock(&bdev->bd_mutex);
+	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3


From 426d31071ac476ea62c62656b242930c17b58c00 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Sat, 7 Aug 2010 12:30:03 +0200
Subject: fix printk typo 'faild'

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/trace/trace_kprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f52b5f50299d..58716e73e2a2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -490,7 +490,7 @@ static int register_trace_probe(struct trace_probe *tp)
 	}
 	ret = register_probe_event(tp);
 	if (ret) {
-		pr_warning("Faild to register probe event(%d)\n", ret);
+		pr_warning("Failed to register probe event(%d)\n", ret);
 		goto end;
 	}
 
-- 
cgit v1.2.3


From 8d57a98ccd0b4489003473979da8f5a1363ba7a3 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Wed, 11 Aug 2010 14:17:49 -0700
Subject: block: add secure discard

Secure discard is the same as discard except that all copies of the
discarded sectors (perhaps created by garbage collection) must also be
erased.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Kyungmin Park <kmpark@infradead.org>
Cc: Madhusudhan Chikkature <madhu.cr@ti.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ben Gardiner <bengardiner@nanometrics.ca>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/blktrace.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 82499a5bdcb7..959f8d6c8cc1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -710,6 +710,9 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (rq->cmd_flags & REQ_DISCARD)
 		rw |= REQ_DISCARD;
 
+	if (rq->cmd_flags & REQ_SECURE)
+		rw |= REQ_SECURE;
+
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
 		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
@@ -1816,6 +1819,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 		rwbs[i++] = 'S';
 	if (rw & REQ_META)
 		rwbs[i++] = 'M';
+	if (rw & REQ_SECURE)
+		rwbs[i++] = 'E';
 
 	rwbs[i] = '\0';
 }
@@ -1828,6 +1833,9 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
 	if (rq->cmd_flags & REQ_DISCARD)
 		rw |= REQ_DISCARD;
 
+	if (rq->cmd_flags & REQ_SECURE)
+		rw |= REQ_SECURE;
+
 	bytes = blk_rq_bytes(rq);
 
 	blk_fill_rwbs(rwbs, rw, bytes);
-- 
cgit v1.2.3


From 2a37a3df57c44e947271758a1aa4bea7bff9feab Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 3 Jun 2010 15:21:34 -0400
Subject: tracing/events: Convert format output to seq_file

Two new events were added that broke the current format output.

Both from the SCSI system: scsi_dispatch_cmd_done and scsi_dispatch_cmd_timeout

The reason is that their print_fmt exceeded a page size. Since the output
of the format used simple_read_from_buffer and trace_seq, it was limited
to a page size in output.

This patch converts the printing of the format of an event into seq_file,
which allows greater than a page size to be shown.

I diffed all event formats comparing the output with and without this
patch. All matched except for the above two, which showed just:

  FORMAT TOO BIG

without this patch, but now properly displays the output with this patch.

v2: Remove updating *pos in seq start function.
   [ Thanks to Li Zefan for pointing that out ]

Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Kei Tokunaga <tokunaga.keiich@jp.fujitsu.com>
Cc: James Bottomley <James.Bottomley@suse.de>
Cc: Tomohiro Kusumi <kusumi.tomohiro@jp.fujitsu.com>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 208 ++++++++++++++++++++++++++++++--------------
 1 file changed, 141 insertions(+), 67 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..45a8968707aa 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -29,6 +29,8 @@ DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
 
+#define COMMON_FIELD_COUNT	5
+
 struct list_head *
 trace_get_fields(struct ftrace_event_call *event_call)
 {
@@ -544,85 +546,155 @@ out:
 	return ret;
 }
 
-static ssize_t
-event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
-		  loff_t *ppos)
+enum {
+	FORMAT_HEADER		= 1,
+	FORMAT_PRINTFMT		= 2,
+};
+
+static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = filp->private_data;
+	struct ftrace_event_call *call = m->private;
 	struct ftrace_event_field *field;
 	struct list_head *head;
-	struct trace_seq *s;
-	int common_field_count = 5;
-	char *buf;
-	int r = 0;
-
-	if (*ppos)
-		return 0;
+	loff_t index = *pos;
 
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
+	(*pos)++;
 
-	trace_seq_init(s);
+	head = trace_get_fields(call);
 
-	trace_seq_printf(s, "name: %s\n", call->name);
-	trace_seq_printf(s, "ID: %d\n", call->event.type);
-	trace_seq_printf(s, "format:\n");
+	switch ((unsigned long)v) {
+	case FORMAT_HEADER:
 
-	head = trace_get_fields(call);
-	list_for_each_entry_reverse(field, head, link) {
-		/*
-		 * Smartly shows the array type(except dynamic array).
-		 * Normal:
-		 *	field:TYPE VAR
-		 * If TYPE := TYPE[LEN], it is shown:
-		 *	field:TYPE VAR[LEN]
-		 */
-		const char *array_descriptor = strchr(field->type, '[');
-
-		if (!strncmp(field->type, "__data_loc", 10))
-			array_descriptor = NULL;
-
-		if (!array_descriptor) {
-			r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
-					"\tsize:%u;\tsigned:%d;\n",
-					field->type, field->name, field->offset,
-					field->size, !!field->is_signed);
-		} else {
-			r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
-					"\tsize:%u;\tsigned:%d;\n",
-					(int)(array_descriptor - field->type),
-					field->type, field->name,
-					array_descriptor, field->offset,
-					field->size, !!field->is_signed);
-		}
+		if (unlikely(list_empty(head)))
+			return NULL;
 
-		if (--common_field_count == 0)
-			r = trace_seq_printf(s, "\n");
+		field = list_entry(head->prev, struct ftrace_event_field, link);
+		return field;
 
-		if (!r)
-			break;
+	case FORMAT_PRINTFMT:
+		/* all done */
+		return NULL;
 	}
 
-	if (r)
-		r = trace_seq_printf(s, "\nprint fmt: %s\n",
-				call->print_fmt);
+	/*
+	 * To separate common fields from event fields, the
+	 * LSB is set on the first event field. Clear it in case.
+	 */
+	v = (void *)((unsigned long)v & ~1L);
 
-	if (!r) {
-		/*
-		 * ug!  The format output is bigger than a PAGE!!
-		 */
-		buf = "FORMAT TOO BIG\n";
-		r = simple_read_from_buffer(ubuf, cnt, ppos,
-					      buf, strlen(buf));
-		goto out;
+	field = v;
+	if (field->link.prev == head)
+		return (void *)FORMAT_PRINTFMT;
+
+	field = list_entry(field->link.prev, struct ftrace_event_field, link);
+
+	/* Set the LSB to notify f_show to print an extra newline */
+	if (index == COMMON_FIELD_COUNT)
+		field = (struct ftrace_event_field *)
+			((unsigned long)field | 1);
+
+	return field;
+}
+
+static void *f_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t l = 0;
+	void *p;
+
+	/* Start by showing the header */
+	if (!*pos)
+		return (void *)FORMAT_HEADER;
+
+	p = (void *)FORMAT_HEADER;
+	do {
+		p = f_next(m, p, &l);
+	} while (p && l < *pos);
+
+	return p;
+}
+
+static int f_show(struct seq_file *m, void *v)
+{
+	struct ftrace_event_call *call = m->private;
+	struct ftrace_event_field *field;
+	const char *array_descriptor;
+
+	switch ((unsigned long)v) {
+	case FORMAT_HEADER:
+		seq_printf(m, "name: %s\n", call->name);
+		seq_printf(m, "ID: %d\n", call->event.type);
+		seq_printf(m, "format:\n");
+		return 0;
+
+	case FORMAT_PRINTFMT:
+		seq_printf(m, "\nprint fmt: %s\n",
+			   call->print_fmt);
+		return 0;
 	}
 
-	r = simple_read_from_buffer(ubuf, cnt, ppos,
-				    s->buffer, s->len);
- out:
-	kfree(s);
-	return r;
+	/*
+	 * To separate common fields from event fields, the
+	 * LSB is set on the first event field. Clear it and
+	 * print a newline if it is set.
+	 */
+	if ((unsigned long)v & 1) {
+		seq_putc(m, '\n');
+		v = (void *)((unsigned long)v & ~1L);
+	}
+
+	field = v;
+
+	/*
+	 * Smartly shows the array type(except dynamic array).
+	 * Normal:
+	 *	field:TYPE VAR
+	 * If TYPE := TYPE[LEN], it is shown:
+	 *	field:TYPE VAR[LEN]
+	 */
+	array_descriptor = strchr(field->type, '[');
+
+	if (!strncmp(field->type, "__data_loc", 10))
+		array_descriptor = NULL;
+
+	if (!array_descriptor)
+		seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+			   field->type, field->name, field->offset,
+			   field->size, !!field->is_signed);
+	else
+		seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+			   (int)(array_descriptor - field->type),
+			   field->type, field->name,
+			   array_descriptor, field->offset,
+			   field->size, !!field->is_signed);
+
+	return 0;
+}
+
+static void f_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations trace_format_seq_ops = {
+	.start		= f_start,
+	.next		= f_next,
+	.stop		= f_stop,
+	.show		= f_show,
+};
+
+static int trace_format_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_event_call *call = inode->i_private;
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open(file, &trace_format_seq_ops);
+	if (ret < 0)
+		return ret;
+
+	m = file->private_data;
+	m->private = call;
+
+	return 0;
 }
 
 static ssize_t
@@ -820,8 +892,10 @@ static const struct file_operations ftrace_enable_fops = {
 };
 
 static const struct file_operations ftrace_event_format_fops = {
-	.open = tracing_open_generic,
-	.read = event_format_read,
+	.open = trace_format_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
 };
 
 static const struct file_operations ftrace_event_id_fops = {
-- 
cgit v1.2.3


From 1aa54bca6ee0d07ebcafb8ca8074b624d80724aa Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Wed, 28 Jul 2010 01:18:01 +0200
Subject: tracing: Sanitize value returned from write(trace_marker, "...", len)

When userspace code writes non-new-line-terminated string to trace_marker
file, write handler appends new-line and returns number of bytes written
to trace buffer, so
write(fd, "abc", 3) will return 4

That's unexpected and unfortunately it confuses glibc's fprintf function.

Example:
int main() {
  fprintf(stderr, "abc");
  return 0;
}

$ gcc test.c -o test
$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
$ ./test 2>/sys/kernel/debug/tracing/trace_marker

results in infinite loop:
write(fd, "abc", 3) = 4
write(fd, "", 1) = 0
write(fd, "", 1) = 0
write(fd, "", 1) = 0
write(fd, "", 1) = 0
write(fd, "", 1) = 0
write(fd, "", 1) = 0
write(fd, "", 1) = 0
(...)

...and kernel trace buffer full of empty markers.

Fix it by sanitizing write return value.

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
LKML-Reference: <20100727231801.GB2826@joi.lan>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..88b42d14d32d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3498,6 +3498,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 					size_t cnt, loff_t *fpos)
 {
 	char *buf;
+	size_t written;
 
 	if (tracing_disabled)
 		return -EINVAL;
@@ -3519,11 +3520,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	} else
 		buf[cnt] = '\0';
 
-	cnt = mark_printk("%s", buf);
+	written = mark_printk("%s", buf);
 	kfree(buf);
-	*fpos += cnt;
+	*fpos += written;
 
-	return cnt;
+	/* don't tell userspace we wrote more - it might confuse them */
+	if (written > cnt)
+		written = cnt;
+
+	return written;
 }
 
 static int tracing_clock_show(struct seq_file *m, void *v)
-- 
cgit v1.2.3


From 86397dc3ccfc0e17b7550d05eaf15fe91f6498dd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 17 Aug 2010 13:53:06 +0800
Subject: tracing: Clean up seqfile code for format file

Remove the nasty hack that marks a pointer's LSB to distinguish common
fields from event fields. Replace it with a more sane approach.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4C6A23C2.9020606@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 55 +++++++++++++++------------------------------
 1 file changed, 18 insertions(+), 37 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..398c0e8b332c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
 
 enum {
 	FORMAT_HEADER		= 1,
-	FORMAT_PRINTFMT		= 2,
+	FORMAT_FIELD_SEPERATOR	= 2,
+	FORMAT_PRINTFMT		= 3,
 };
 
 static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_event_call *call = m->private;
 	struct ftrace_event_field *field;
-	struct list_head *head;
+	struct list_head *common_head = &ftrace_common_fields;
+	struct list_head *head = trace_get_fields(call);
 
 	(*pos)++;
 
 	switch ((unsigned long)v) {
 	case FORMAT_HEADER:
-		head = &ftrace_common_fields;
+		if (unlikely(list_empty(common_head)))
+			return NULL;
+
+		field = list_entry(common_head->prev,
+				   struct ftrace_event_field, link);
+		return field;
 
+	case FORMAT_FIELD_SEPERATOR:
 		if (unlikely(list_empty(head)))
 			return NULL;
 
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 		return NULL;
 	}
 
-	head = trace_get_fields(call);
-
-	/*
-	 * To separate common fields from event fields, the
-	 * LSB is set on the first event field. Clear it in case.
-	 */
-	v = (void *)((unsigned long)v & ~1L);
-
 	field = v;
-	/*
-	 * If this is a common field, and at the end of the list, then
-	 * continue with main list.
-	 */
-	if (field->link.prev == &ftrace_common_fields) {
-		if (unlikely(list_empty(head)))
-			return NULL;
-		field = list_entry(head->prev, struct ftrace_event_field, link);
-		/* Set the LSB to notify f_show to print an extra newline */
-		field = (struct ftrace_event_field *)
-			((unsigned long)field | 1);
-		return field;
-	}
-
-	/* If we are done tell f_show to print the format */
-	if (field->link.prev == head)
+	if (field->link.prev == common_head)
+		return (void *)FORMAT_FIELD_SEPERATOR;
+	else if (field->link.prev == head)
 		return (void *)FORMAT_PRINTFMT;
 
 	field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
 		seq_printf(m, "format:\n");
 		return 0;
 
+	case FORMAT_FIELD_SEPERATOR:
+		seq_putc(m, '\n');
+		return 0;
+
 	case FORMAT_PRINTFMT:
 		seq_printf(m, "\nprint fmt: %s\n",
 			   call->print_fmt);
 		return 0;
 	}
 
-	/*
-	 * To separate common fields from event fields, the
-	 * LSB is set on the first event field. Clear it and
-	 * print a newline if it is set.
-	 */
-	if ((unsigned long)v & 1) {
-		seq_putc(m, '\n');
-		v = (void *)((unsigned long)v & ~1L);
-	}
-
 	field = v;
 
 	/*
-- 
cgit v1.2.3


From 7ae07ea3a48d30689ee037cb136bc21f0b37d8ae Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 14 Aug 2010 20:45:13 +0200
Subject: perf: Humanize the number of contexts

Instead of hardcoding the number of contexts for the recursions
barriers, define a cpp constant to make the code more
self-explanatory.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
---
 kernel/trace/trace_event_perf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 000e6e85b445..db2eae2efcf2 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
 #include <linux/kprobes.h>
 #include "trace.h"
 
-static char *perf_trace_buf[4];
+static char *perf_trace_buf[PERF_NR_CONTEXTS];
 
 /*
  * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -45,7 +45,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 		char *buf;
 		int i;
 
-		for (i = 0; i < 4; i++) {
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			buf = (char *)alloc_percpu(perf_trace_t);
 			if (!buf)
 				goto fail;
@@ -65,7 +65,7 @@ fail:
 	if (!total_ref_count) {
 		int i;
 
-		for (i = 0; i < 4; i++) {
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			free_percpu(perf_trace_buf[i]);
 			perf_trace_buf[i] = NULL;
 		}
@@ -140,7 +140,7 @@ void perf_trace_destroy(struct perf_event *p_event)
 	tp_event->perf_events = NULL;
 
 	if (!--total_ref_count) {
-		for (i = 0; i < 4; i++) {
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			free_percpu(perf_trace_buf[i]);
 			perf_trace_buf[i] = NULL;
 		}
-- 
cgit v1.2.3


From 6016ee13db518ab1cd0cbf43fc2ad5712021e338 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 11 Aug 2010 12:47:59 +0900
Subject: perf, tracing: add missing __percpu markups

ftrace_event_call->perf_events, perf_trace_buf,
fgraph_data->cpu_data and some local variables are percpu pointers
missing __percpu markups. Add them.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
LKML-Reference: <1281498479-28551-1-git-send-email-namhyung@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_event_perf.c      | 15 ++++++++-------
 kernel/trace/trace_functions_graph.c |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index db2eae2efcf2..92f5477a006a 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
 #include <linux/kprobes.h>
 #include "trace.h"
 
-static char *perf_trace_buf[PERF_NR_CONTEXTS];
+static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 
 /*
  * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int	total_ref_count;
 static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
-	struct hlist_head *list;
+	struct hlist_head __percpu *list;
 	int ret = -ENOMEM;
 	int cpu;
 
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 	tp_event->perf_events = list;
 
 	if (!total_ref_count) {
-		char *buf;
+		char __percpu *buf;
 		int i;
 
 		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
-			buf = (char *)alloc_percpu(perf_trace_t);
+			buf = (char __percpu *)alloc_percpu(perf_trace_t);
 			if (!buf)
 				goto fail;
 
@@ -102,13 +102,14 @@ int perf_trace_init(struct perf_event *p_event)
 int perf_trace_enable(struct perf_event *p_event)
 {
 	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct hlist_head __percpu *pcpu_list;
 	struct hlist_head *list;
 
-	list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!list))
+	pcpu_list = tp_event->perf_events;
+	if (WARN_ON_ONCE(!pcpu_list))
 		return -EINVAL;
 
-	list = this_cpu_ptr(list);
+	list = this_cpu_ptr(pcpu_list);
 	hlist_add_head_rcu(&p_event->hlist_entry, list);
 
 	return 0;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6bff23625781..fcb5a542cd21 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -23,7 +23,7 @@ struct fgraph_cpu_data {
 };
 
 struct fgraph_data {
-	struct fgraph_cpu_data		*cpu_data;
+	struct fgraph_cpu_data __percpu *cpu_data;
 
 	/* Place to preserve last processed entry. */
 	struct ftrace_graph_ent_entry	ent;
-- 
cgit v1.2.3


From 151772dbfad4dbe81721e40f9b3d588ea77bb7aa Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 25 Aug 2010 11:32:38 +1000
Subject: tracing/trace_stack: Fix stack trace on ppc64

save_stack_trace() stores the instruction pointer, not the
function descriptor. On ppc64 the trace stack code currently
dereferences the instruction pointer and shows 8 bytes of
instructions in our backtraces:

 # cat /sys/kernel/debug/tracing/stack_trace
        Depth    Size   Location    (26 entries)
        -----    ----   --------
  0)     5424     112   0x6000000048000004
  1)     5312     160   0x60000000ebad01b0
  2)     5152     160   0x2c23000041c20030
  3)     4992     240   0x600000007c781b79
  4)     4752     160   0xe84100284800000c
  5)     4592     192   0x600000002fa30000
  6)     4400     256   0x7f1800347b7407e0
  7)     4144     208   0xe89f0108f87f0070
  8)     3936     272   0xe84100282fa30000

Since we aren't dealing with function descriptors, use %pS
instead of %pF to fix it:

 # cat /sys/kernel/debug/tracing/stack_trace
        Depth    Size   Location    (26 entries)
        -----    ----   --------
  0)     5424     112   ftrace_call+0x4/0x8
  1)     5312     160   .current_io_context+0x28/0x74
  2)     5152     160   .get_io_context+0x48/0xa0
  3)     4992     240   .cfq_set_request+0x94/0x4c4
  4)     4752     160   .elv_set_request+0x60/0x84
  5)     4592     192   .get_request+0x2d4/0x468
  6)     4400     256   .get_request_wait+0x7c/0x258
  7)     4144     208   .__make_request+0x49c/0x610
  8)     3936     272   .generic_make_request+0x390/0x434

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: rostedt@goodmis.org
Cc: fweisbec@gmail.com
LKML-Reference: <20100825013238.GE28360@kryten>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 056468eae7cf..a6b7e0e0f3eb 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -249,7 +249,7 @@ static int trace_lookup_stack(struct seq_file *m, long i)
 {
 	unsigned long addr = stack_dump_trace[i];
 
-	return seq_printf(m, "%pF\n", (void *)addr);
+	return seq_printf(m, "%pS\n", (void *)addr);
 }
 
 static void print_disabled(struct seq_file *m)
-- 
cgit v1.2.3


From 3aaba20f26f58843e8f20611e5c0b1c06954310f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 23 Aug 2010 16:50:12 +0800
Subject: tracing: Fix a race in function profile

While we are reading trace_stat/functionX and someone just
disabled function_profile at that time, we can trigger this:

	divide error: 0000 [#1] PREEMPT SMP
	...
	EIP is at function_stat_show+0x90/0x230
	...

This fix just takes the ftrace_profile_lock and checks if
rec->counter is 0. If it's 0, we know the profile buffer
has been reset.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: stable@kernel.org
LKML-Reference: <4C723644.4040708@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0d88ce9b9fb8..7cb1f45a1de1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
 {
 	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
+	int ret = 0;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	static DEFINE_MUTEX(mutex);
 	static struct trace_seq s;
 	unsigned long long avg;
 	unsigned long long stddev;
 #endif
+	mutex_lock(&ftrace_profile_lock);
+
+	/* we raced with function_profile_reset() */
+	if (unlikely(rec->counter == 0)) {
+		ret = -EBUSY;
+		goto out;
+	}
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 	seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v)
 		do_div(stddev, (rec->counter - 1) * 1000);
 	}
 
-	mutex_lock(&mutex);
 	trace_seq_init(&s);
 	trace_print_graph_duration(rec->time, &s);
 	trace_seq_puts(&s, "    ");
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v)
 	trace_seq_puts(&s, "    ");
 	trace_print_graph_duration(stddev, &s);
 	trace_print_seq(m, &s);
-	mutex_unlock(&mutex);
 #endif
 	seq_putc(m, '\n');
+out:
+	mutex_unlock(&ftrace_profile_lock);
 
-	return 0;
+	return ret;
 }
 
 static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
-- 
cgit v1.2.3


From f6195aa09e618d712f52bf4fa33b5293820eb93d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 1 Sep 2010 12:23:12 -0400
Subject: ring-buffer: Place duplicate expression into a single function

While discussing the strictness of the 80 character limit on the
Kernel Summit Discussion mailing list, I showed examples that I
broke that limit slightly with some algorithms. In discussing with
John Linville, what looked better, I realized that two of the
80 char breaking culprits were an identical expression.

As a clean up, this patch moves the identical expression into its
own helper function and that is used instead. As a side effect,
the offending code is now under the 80 character limit. :-)

This clean up code also changes the expression from

	(A - B) - C  to  A - (B + C)

This makes the code look a little nicer too.

Cc: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 19cccc3c3028..ef27017caa56 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	return local_read(&cpu_buffer->entries) -
+		(local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
+
 /**
  * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
  * @buffer: The ring buffer
@@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
-	ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
-		- cpu_buffer->read;
 
-	return ret;
+	return rb_num_of_entries(cpu_buffer);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 
@@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
-		entries += (local_read(&cpu_buffer->entries) -
-			    local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
+		entries += rb_num_of_entries(cpu_buffer);
 	}
 
 	return entries;
-- 
cgit v1.2.3


From 09bfafac3e237415cc4b6adde49f9f28b3a42659 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Tue, 10 Aug 2010 19:32:37 +0100
Subject: ARM: 6314/1: ftrace: allow build without frame pointers on ARM

With a new enough GCC, ARM function tracing can be supported without the
need for frame pointers.  This is essential for Thumb-2 support, since
frame pointers aren't available then.

Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 kernel/trace/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..6329d063b5e4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -121,7 +121,7 @@ if FTRACE
 config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
 	depends on HAVE_FUNCTION_TRACER
-	select FRAME_POINTER
+	select FRAME_POINTER if (!ARM_UNWIND)
 	select KALLSYMS
 	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
-- 
cgit v1.2.3


From b3bd3de66f60df4c9a2076e2886a622458929056 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 10 Aug 2010 14:17:51 -0700
Subject: gcc-4.6: kernel/*: Fix unused but set warnings

No real bugs I believe, just some dead code.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: andi@firstfloor.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 19cccc3c3028..492197e2f86c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2985,13 +2985,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 static void rb_advance_iter(struct ring_buffer_iter *iter)
 {
-	struct ring_buffer *buffer;
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
 	unsigned length;
 
 	cpu_buffer = iter->cpu_buffer;
-	buffer = cpu_buffer->buffer;
 
 	/*
 	 * Check if we are at the end of the buffer.
-- 
cgit v1.2.3


From 9c55cb12c1c172e2d51e85fbb5a4796ca86b77e7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 8 Sep 2010 11:20:37 -0400
Subject: tracing: Do not allow llseek to set_ftrace_filter

Reading the file set_ftrace_filter does three things.

1) shows whether or not filters are set for the function tracer
2) shows what functions are set for the function tracer
3) shows what triggers are set on any functions

3 is independent from 1 and 2.

The way this file currently works is that it is a state machine,
and as you read it, it may change state. But this assumption breaks
when you use lseek() on the file. The state machine gets out of sync
and the t_show() may use the wrong pointer and cause a kernel oops.

Luckily, this will only kill the app that does the lseek, but the app
dies while holding a mutex. This prevents anyone else from using the
set_ftrace_filter file (or any other function tracing file for that matter).

A real fix for this is to rewrite the code, but that is too much for
a -rc release or stable. This patch simply disables llseek on the
set_ftrace_filter() file for now, and we can do the proper fix for the
next major release.

Reported-by: Robert Swiecki <swiecki@google.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Eugene Teo <eugene@redhat.com>
Cc: vendor-sec@lst.de
Cc: <stable@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7cb1f45a1de1..83a16e9ee518 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2416,7 +2416,7 @@ static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = no_llseek,
 	.release = ftrace_filter_release,
 };
 
-- 
cgit v1.2.3


From 61a527362234ac3352a91ac67c50c6f7cd248eb1 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Fri, 27 Aug 2010 20:38:46 +0900
Subject: tracing/kprobe: Fix a memory leak in error case

Fix a memory leak which happens when a field name conflicts with others. In
error case, free_trace_probe() will free all arguments until nr_args, so this
increments nr_args the begining of the loop instead of the end.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
LKML-Reference: <20100827113846.22882.12670.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/trace/trace_kprobe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8b27c9849b42..0116c038b0bc 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -992,6 +992,9 @@ static int create_trace_probe(int argc, char **argv)
 	/* parse arguments */
 	ret = 0;
 	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+		/* Increment count for freeing args in error case */
+		tp->nr_args++;
+
 		/* Parse argument name */
 		arg = strchr(argv[i], '=');
 		if (arg)
@@ -1021,11 +1024,8 @@ static int create_trace_probe(int argc, char **argv)
 		ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
 		if (ret) {
 			pr_info("Parse error at argument%d. (%d)\n", i, ret);
-			kfree(tp->args[i].name);
 			goto error;
 		}
-
-		tp->nr_args++;
 	}
 
 	ret = register_trace_probe(tp);
-- 
cgit v1.2.3


From aba91595cfcebd193425e20aabc407531526a1c5 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Fri, 27 Aug 2010 20:39:06 +0900
Subject: tracing/kprobes: Fix handling of argument names

Set "argN" name for each argument automatically if it has no specified name.
Since dynamic trace event(kprobe_events) accepts special characters for its
argument, its format can show those special characters (e.g. '$', '%', '+').
However, perf can't parse those format because of the character (especially
'%') mess up the format.  This sets "argX" name for those arguments if user
omitted the argument names.

E.g.
 # echo 'p do_fork %ax IP=%ip $stack' > tracing/kprobe_events
 # cat tracing/kprobe_events
 p:kprobes/p_do_fork_0 do_fork arg1=%ax IP=%ip arg3=$stack

Reported-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
LKML-Reference: <20100827113906.22882.59312.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/trace/trace_kprobe.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 0116c038b0bc..a39251ef1a7b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -997,15 +997,18 @@ static int create_trace_probe(int argc, char **argv)
 
 		/* Parse argument name */
 		arg = strchr(argv[i], '=');
-		if (arg)
+		if (arg) {
 			*arg++ = '\0';
-		else
+			tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+		} else {
 			arg = argv[i];
+			/* If argument name is omitted, set "argN" */
+			snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
+			tp->args[i].name = kstrdup(buf, GFP_KERNEL);
+		}
 
-		tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
 		if (!tp->args[i].name) {
-			pr_info("Failed to allocate argument%d name '%s'.\n",
-				i, argv[i]);
+			pr_info("Failed to allocate argument[%d] name.\n", i);
 			ret = -ENOMEM;
 			goto error;
 		}
@@ -1014,7 +1017,7 @@ static int create_trace_probe(int argc, char **argv)
 			*tmp = '_';	/* convert : to _ */
 
 		if (conflict_field_name(tp->args[i].name, tp->args, i)) {
-			pr_info("Argument%d name '%s' conflicts with "
+			pr_info("Argument[%d] name '%s' conflicts with "
 				"another field.\n", i, argv[i]);
 			ret = -EINVAL;
 			goto error;
@@ -1023,7 +1026,7 @@ static int create_trace_probe(int argc, char **argv)
 		/* Parse fetch argument */
 		ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
 		if (ret) {
-			pr_info("Parse error at argument%d. (%d)\n", i, ret);
+			pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
 			goto error;
 		}
 	}
-- 
cgit v1.2.3


From da34634fd39958725310d2c30c9b4543945f968b Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Fri, 27 Aug 2010 20:39:12 +0900
Subject: tracing/kprobe: Fix handling of C-unlike argument names

Check the argument name whether it is invalid (not C-like symbol name). This
makes event format simple.

Reported-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
LKML-Reference: <20100827113912.22882.62313.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/trace/trace_kprobe.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a39251ef1a7b..544301d29dee 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -514,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
 static int kretprobe_dispatcher(struct kretprobe_instance *ri,
 				struct pt_regs *regs);
 
-/* Check the name is good for event/group */
-static int check_event_name(const char *name)
+/* Check the name is good for event/group/fields */
+static int is_good_name(const char *name)
 {
 	if (!isalpha(*name) && *name != '_')
 		return 0;
@@ -557,7 +557,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
 	else
 		tp->rp.kp.pre_handler = kprobe_dispatcher;
 
-	if (!event || !check_event_name(event)) {
+	if (!event || !is_good_name(event)) {
 		ret = -EINVAL;
 		goto error;
 	}
@@ -567,7 +567,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
 	if (!tp->call.name)
 		goto error;
 
-	if (!group || !check_event_name(group)) {
+	if (!group || !is_good_name(group)) {
 		ret = -EINVAL;
 		goto error;
 	}
@@ -883,7 +883,7 @@ static int create_trace_probe(int argc, char **argv)
 	int i, ret = 0;
 	int is_return = 0, is_delete = 0;
 	char *symbol = NULL, *event = NULL, *group = NULL;
-	char *arg, *tmp;
+	char *arg;
 	unsigned long offset = 0;
 	void *addr = NULL;
 	char buf[MAX_EVENT_NAME_LEN];
@@ -1012,9 +1012,13 @@ static int create_trace_probe(int argc, char **argv)
 			ret = -ENOMEM;
 			goto error;
 		}
-		tmp = strchr(tp->args[i].name, ':');
-		if (tmp)
-			*tmp = '_';	/* convert : to _ */
+
+		if (!is_good_name(tp->args[i].name)) {
+			pr_info("Invalid argument[%d] name: %s\n",
+				i, tp->args[i].name);
+			ret = -EINVAL;
+			goto error;
+		}
 
 		if (conflict_field_name(tp->args[i].name, tp->args, i)) {
 			pr_info("Argument[%d] name '%s' conflicts with "
-- 
cgit v1.2.3


From 9cb627d5f38830ca19aa0dca52d1d3a633018bf7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 1 Sep 2010 12:58:43 +0200
Subject: perf, trace: Fix module leak

Commit 1c024eca (perf, trace: Optimize tracepoints by using
per-tracepoint-per-cpu hlist to track events) caused a module
refcount leak.

Reported-And-Tested-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4C7E1F12.8030304@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_event_perf.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 000e6e85b445..31cc4cb0dbf2 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -91,6 +91,8 @@ int perf_trace_init(struct perf_event *p_event)
 		    tp_event->class && tp_event->class->reg &&
 		    try_module_get(tp_event->mod)) {
 			ret = perf_trace_event_init(tp_event, p_event);
+			if (ret)
+				module_put(tp_event->mod);
 			break;
 		}
 	}
@@ -146,6 +148,7 @@ void perf_trace_destroy(struct perf_event *p_event)
 		}
 	}
 out:
+	module_put(tp_event->mod);
 	mutex_unlock(&event_mutex);
 }
 
-- 
cgit v1.2.3


From a4eaf7f14675cb512d69f0c928055e73d0c6d252 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Jun 2010 14:37:10 +0200
Subject: perf: Rework the PMU methods

Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.

The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.

This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).

It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).

The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:

 1) We disable the counter:
    a) the pmu has per-counter enable bits, we flip that
    b) we program a NOP event, preserving the counter state

 2) We store the counter state and ignore all read/overflow events

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_event_perf.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index f3bbcd1c90c8..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -101,7 +101,7 @@ int perf_trace_init(struct perf_event *p_event)
 	return ret;
 }
 
-int perf_trace_enable(struct perf_event *p_event)
+int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct ftrace_event_call *tp_event = p_event->tp_event;
 	struct hlist_head __percpu *pcpu_list;
@@ -111,13 +111,16 @@ int perf_trace_enable(struct perf_event *p_event)
 	if (WARN_ON_ONCE(!pcpu_list))
 		return -EINVAL;
 
+	if (!(flags & PERF_EF_START))
+		p_event->hw.state = PERF_HES_STOPPED;
+
 	list = this_cpu_ptr(pcpu_list);
 	hlist_add_head_rcu(&p_event->hlist_entry, list);
 
 	return 0;
 }
 
-void perf_trace_disable(struct perf_event *p_event)
+void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	hlist_del_rcu(&p_event->hlist_entry);
 }
-- 
cgit v1.2.3


From df09162550fbb53354f0c88e85b5d0e6129ee9cc Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Thu, 9 Sep 2010 16:34:59 -0700
Subject: tracing: t_start: reset FTRACE_ITER_HASH in case of seek/pread

Be sure to avoid entering t_show() with FTRACE_ITER_HASH set without
having properly started the iterator to iterate the hash.  This case is
degenerate and, as discovered by Robert Swiecki, can cause t_hash_show()
to misuse a pointer.  This causes a NULL ptr deref with possible security
implications.  Tracked as CVE-2010-3079.

Cc: Robert Swiecki <swiecki@google.com>
Cc: Eugene Teo <eugene@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83a16e9ee518..fa7ece649fe1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1510,6 +1510,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 		if (*pos > 0)
 			return t_hash_start(m, pos);
 		iter->flags |= FTRACE_ITER_PRINTALL;
+		/* reset in case of seek/pread */
+		iter->flags &= ~FTRACE_ITER_HASH;
 		return iter;
 	}
 
-- 
cgit v1.2.3


From 2bccfffd1538f3523847583213567e2f7ce00926 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 9 Sep 2010 08:43:22 -0400
Subject: tracing: Do not reset *pos in set_ftrace_filter

After the filtered functions are read, the probed functions are read
from the hash in set_ftrace_filter. When the hashed probed functions
are read, the *pos passed in is reset. Instead of modifying the pos
given to the read function, just record the pos where the filtered
functions ended and subtract from that.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..585ea27025b1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1368,6 +1368,7 @@ enum {
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
+	loff_t			func_pos;
 	struct ftrace_page	*pg;
 	int			hidx;
 	int			idx;
@@ -1418,12 +1419,15 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 	loff_t l;
 
 	if (!(iter->flags & FTRACE_ITER_HASH))
-		*pos = 0;
+		iter->func_pos = *pos;
+
+	if (iter->func_pos > *pos)
+		return NULL;
 
 	iter->flags |= FTRACE_ITER_HASH;
 
 	iter->hidx = 0;
-	for (l = 0; l <= *pos; ) {
+	for (l = 0; l <= (*pos - iter->func_pos); ) {
 		p = t_hash_next(m, p, &l);
 		if (!p)
 			break;
-- 
cgit v1.2.3


From 4aeb69672d011fac5c8df671f3ca89f7987c104e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 9 Sep 2010 10:00:28 -0400
Subject: tracing: Replace typecasted void pointer in set_ftrace_filter code

The set_ftrace_filter uses seq_file and reads from two lists. The
pointer returned by t_next() can either be of type struct dyn_ftrace
or struct ftrace_func_probe. If there is a bug (there was one)
the wrong pointer may be used and the reference can cause an oops.

This patch makes t_next() and friends only return the iterator structure
which now has a pointer of type struct dyn_ftrace and struct
ftrace_func_probe. The t_show() can now test if the pointer is NULL or
not and if the pointer exists, it is guaranteed to be of the correct type.

Now if there's a bug, only wrong data will be shown but not an oops.

Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 67 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 21 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 585ea27025b1..c8db0dbb984e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1368,25 +1368,29 @@ enum {
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
-	loff_t			func_pos;
-	struct ftrace_page	*pg;
-	int			hidx;
-	int			idx;
-	unsigned		flags;
-	struct trace_parser	parser;
+	loff_t				func_pos;
+	struct ftrace_page		*pg;
+	struct dyn_ftrace		*func;
+	struct ftrace_func_probe	*probe;
+	struct trace_parser		parser;
+	int				hidx;
+	int				idx;
+	unsigned			flags;
 };
 
 static void *
-t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+t_hash_next(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
-	struct hlist_node *hnd = v;
+	struct hlist_node *hnd = NULL;
 	struct hlist_head *hhd;
 
 	WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
 
 	(*pos)++;
 
+	if (iter->probe)
+		hnd = &iter->probe->node;
  retry:
 	if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
 		return NULL;
@@ -1409,7 +1413,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
 		}
 	}
 
-	return hnd;
+	if (WARN_ON_ONCE(!hnd))
+		return NULL;
+
+	iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
+
+	return iter;
 }
 
 static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1428,19 +1437,24 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 
 	iter->hidx = 0;
 	for (l = 0; l <= (*pos - iter->func_pos); ) {
-		p = t_hash_next(m, p, &l);
+		p = t_hash_next(m, &l);
 		if (!p)
 			break;
 	}
-	return p;
+	if (!p)
+		return NULL;
+
+	return iter;
 }
 
-static int t_hash_show(struct seq_file *m, void *v)
+static int
+t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
 {
 	struct ftrace_func_probe *rec;
-	struct hlist_node *hnd = v;
 
-	rec = hlist_entry(hnd, struct ftrace_func_probe, node);
+	rec = iter->probe;
+	if (WARN_ON_ONCE(!rec))
+		return -EIO;
 
 	if (rec->ops->print)
 		return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1461,7 +1475,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	struct dyn_ftrace *rec = NULL;
 
 	if (iter->flags & FTRACE_ITER_HASH)
-		return t_hash_next(m, v, pos);
+		return t_hash_next(m, pos);
 
 	(*pos)++;
 
@@ -1495,7 +1509,12 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		}
 	}
 
-	return rec;
+	if (!rec)
+		return NULL;
+
+	iter->func = rec;
+
+	return iter;
 }
 
 static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1530,10 +1549,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 			break;
 	}
 
-	if (!p && iter->flags & FTRACE_ITER_FILTER)
-		return t_hash_start(m, pos);
+	if (!p) {
+		if (iter->flags & FTRACE_ITER_FILTER)
+			return t_hash_start(m, pos);
 
-	return p;
+		return NULL;
+	}
+
+	return iter;
 }
 
 static void t_stop(struct seq_file *m, void *p)
@@ -1544,16 +1567,18 @@ static void t_stop(struct seq_file *m, void *p)
 static int t_show(struct seq_file *m, void *v)
 {
 	struct ftrace_iterator *iter = m->private;
-	struct dyn_ftrace *rec = v;
+	struct dyn_ftrace *rec;
 
 	if (iter->flags & FTRACE_ITER_HASH)
-		return t_hash_show(m, v);
+		return t_hash_show(m, iter);
 
 	if (iter->flags & FTRACE_ITER_PRINTALL) {
 		seq_printf(m, "#### all functions enabled ####\n");
 		return 0;
 	}
 
+	rec = iter->func;
+
 	if (!rec)
 		return 0;
 
-- 
cgit v1.2.3


From 98c4fd046f07156ca6055677e8f03d4280be16c1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Sep 2010 11:47:43 -0400
Subject: tracing: Keep track of set_ftrace_filter position and allow lseek
 again

This patch keeps track of the index within the elements of
set_ftrace_filter and if the position goes backwards, it nicely
resets and starts from the beginning again.

This allows for lseek and pread to work properly now.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c8db0dbb984e..2d51166b93fe 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1368,6 +1368,7 @@ enum {
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
+	loff_t				pos;
 	loff_t				func_pos;
 	struct ftrace_page		*pg;
 	struct dyn_ftrace		*func;
@@ -1385,9 +1386,8 @@ t_hash_next(struct seq_file *m, loff_t *pos)
 	struct hlist_node *hnd = NULL;
 	struct hlist_head *hhd;
 
-	WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
-
 	(*pos)++;
+	iter->pos = *pos;
 
 	if (iter->probe)
 		hnd = &iter->probe->node;
@@ -1427,14 +1427,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 	void *p = NULL;
 	loff_t l;
 
-	if (!(iter->flags & FTRACE_ITER_HASH))
-		iter->func_pos = *pos;
-
 	if (iter->func_pos > *pos)
 		return NULL;
 
-	iter->flags |= FTRACE_ITER_HASH;
-
 	iter->hidx = 0;
 	for (l = 0; l <= (*pos - iter->func_pos); ) {
 		p = t_hash_next(m, &l);
@@ -1444,6 +1439,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 	if (!p)
 		return NULL;
 
+	/* Only set this if we have an item */
+	iter->flags |= FTRACE_ITER_HASH;
+
 	return iter;
 }
 
@@ -1478,6 +1476,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		return t_hash_next(m, pos);
 
 	(*pos)++;
+	iter->pos = *pos;
+	iter->func_pos = *pos;
 
 	if (iter->flags & FTRACE_ITER_PRINTALL)
 		return NULL;
@@ -1517,6 +1517,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	return iter;
 }
 
+static void reset_iter_read(struct ftrace_iterator *iter)
+{
+	iter->pos = 0;
+	iter->func_pos = 0;
+	iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
+}
+
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
@@ -1524,6 +1531,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	loff_t l;
 
 	mutex_lock(&ftrace_lock);
+	/*
+	 * If an lseek was done, then reset and start from beginning.
+	 */
+	if (*pos < iter->pos)
+		reset_iter_read(iter);
+
 	/*
 	 * For set_ftrace_filter reading, if we have the filter
 	 * off, we can short cut and just print out that all
@@ -1541,6 +1554,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	if (iter->flags & FTRACE_ITER_HASH)
 		return t_hash_start(m, pos);
 
+	/*
+	 * Unfortunately, we need to restart at ftrace_pages_start
+	 * every time we let go of the ftrace_mutex. This is because
+	 * those pointers can change without the lock.
+	 */
 	iter->pg = ftrace_pages_start;
 	iter->idx = 0;
 	for (l = 0; l <= *pos; ) {
@@ -2447,7 +2465,7 @@ static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = no_llseek,
+	.llseek = ftrace_regex_lseek,
 	.release = ftrace_filter_release,
 };
 
-- 
cgit v1.2.3


From 57c072c7113f54f9512624d6c665db6184448782 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Sep 2010 11:21:11 -0400
Subject: tracing: Fix reading of set_ftrace_filter across lists

If we do:

 # cd /sys/kernel/debug
 # echo 'do_IRQ:traceon schedule:traceon sys_write:traceon' > \
    set_ftrace_filter
 # cat set_ftrace_filter

We get the following output:

 #### all functions enabled ####
 sys_write:traceon:unlimited
 schedule:traceon:unlimited
 do_IRQ:traceon:unlimited

This outputs two lists. One is the fact that all functions are
currently enabled for function tracing, the other has three probed
functions, which happen to have 'traceon' as their commands.

Currently, when reading the first list (functions enabled) the
seq_file code will receive a "NULL" from the t_next() function
causing it to exit early. This makes "read()" from userspace stop
reading the code at this boarder. Although read is allowed to do this,
some (broken) applications might consider this an end of file and
stop early.

This patch adds the start of the second list to t_next() when it
finishes the first list. It is a simple change and gives the
set_ftrace_filter file nicer reading ability.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2d51166b93fe..1884cf5bc110 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1477,10 +1477,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 	(*pos)++;
 	iter->pos = *pos;
-	iter->func_pos = *pos;
 
 	if (iter->flags & FTRACE_ITER_PRINTALL)
-		return NULL;
+		return t_hash_start(m, pos);
 
  retry:
 	if (iter->idx >= iter->pg->index) {
@@ -1510,8 +1509,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	}
 
 	if (!rec)
-		return NULL;
+		return t_hash_start(m, pos);
 
+	iter->func_pos = *pos;
 	iter->func = rec;
 
 	return iter;
-- 
cgit v1.2.3


From 2bd16212b8eb86f9574e78d6605a5ba9e9aa8c4e Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 7 Sep 2010 16:53:44 +0200
Subject: tracing: Add funcgraph-irq option for function graph tracer.

It's handy to be able to disable the irq related output
and not to have to jump over each irq related code, when
you have no interrest in it.

The option is by default enabled, so there's no change to
current behaviour. It affects only the final output, so all
the irq related data stay in the ring buffer.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <20100907145344.GC1912@jolsa.brq.redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 101 ++++++++++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c93bcb248638..8674750a5ece 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -18,6 +18,7 @@
 struct fgraph_cpu_data {
 	pid_t		last_pid;
 	int		depth;
+	int		depth_irq;
 	int		ignore;
 	unsigned long	enter_funcs[FTRACE_RETFUNC_DEPTH];
 };
@@ -41,6 +42,7 @@ struct fgraph_data {
 #define TRACE_GRAPH_PRINT_PROC		0x8
 #define TRACE_GRAPH_PRINT_DURATION	0x10
 #define TRACE_GRAPH_PRINT_ABS_TIME	0x20
+#define TRACE_GRAPH_PRINT_IRQS		0x40
 
 static struct tracer_opt trace_opts[] = {
 	/* Display overruns? (for self-debug purpose) */
@@ -55,13 +57,15 @@ static struct tracer_opt trace_opts[] = {
 	{ TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
 	/* Display absolute time of an entry */
 	{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
+	/* Display interrupts */
+	{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
 	{ } /* Empty entry */
 };
 
 static struct tracer_flags tracer_flags = {
 	/* Don't display overruns and proc by default */
 	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
-	       TRACE_GRAPH_PRINT_DURATION,
+	       TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
 	.opts = trace_opts
 };
 
@@ -855,6 +859,92 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 	return 0;
 }
 
+/*
+ * Entry check for irq code
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just extered irq code
+ *
+ * retunns 0 if
+ *  - funcgraph-interrupts option is set
+ *  - we are not inside irq code
+ */
+static int
+check_irq_entry(struct trace_iterator *iter, u32 flags,
+		unsigned long addr, int depth)
+{
+	int cpu = iter->cpu;
+	struct fgraph_data *data = iter->private;
+	int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+	if (flags & TRACE_GRAPH_PRINT_IRQS)
+		return 0;
+
+	/*
+	 * We are inside the irq code
+	 */
+	if (*depth_irq >= 0)
+		return 1;
+
+	if ((addr < (unsigned long)__irqentry_text_start) ||
+	    (addr >= (unsigned long)__irqentry_text_end))
+		return 0;
+
+	/*
+	 * We are entering irq code.
+	 */
+	*depth_irq = depth;
+	return 1;
+}
+
+/*
+ * Return check for irq code
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just left irq code
+ *
+ * returns 0 if
+ *  - funcgraph-interrupts option is set
+ *  - we are not inside irq code
+ */
+static int
+check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
+{
+	int cpu = iter->cpu;
+	struct fgraph_data *data = iter->private;
+	int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+	if (flags & TRACE_GRAPH_PRINT_IRQS)
+		return 0;
+
+	/*
+	 * We are not inside the irq code.
+	 */
+	if (*depth_irq == -1)
+		return 0;
+
+	/*
+	 * We are inside the irq code, and this is returning entry.
+	 * Let's not trace it and clear the entry depth, since
+	 * we are out of irq code.
+	 *
+	 * This condition ensures that we 'leave the irq code' once
+	 * we are out of the entry depth. Thus protecting us from
+	 * the RETURN entry loss.
+	 */
+	if (*depth_irq >= depth) {
+		*depth_irq = -1;
+		return 1;
+	}
+
+	/*
+	 * We are inside the irq code, and this is not the entry.
+	 */
+	return 1;
+}
+
 static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 			struct trace_iterator *iter, u32 flags)
@@ -865,6 +955,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	static enum print_line_t ret;
 	int cpu = iter->cpu;
 
+	if (check_irq_entry(iter, flags, call->func, call->depth))
+		return TRACE_TYPE_HANDLED;
+
 	if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -902,6 +995,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	int ret;
 	int i;
 
+	if (check_irq_return(iter, flags, trace->depth))
+		return TRACE_TYPE_HANDLED;
+
 	if (data) {
 		struct fgraph_cpu_data *cpu_data;
 		int cpu = iter->cpu;
@@ -1210,9 +1306,12 @@ void graph_trace_open(struct trace_iterator *iter)
 		pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
 		int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
 		int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+		int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
 		*pid = -1;
 		*depth = 0;
 		*ignore = 0;
+		*depth_irq = -1;
 	}
 
 	iter->private = data;
-- 
cgit v1.2.3


From b304d0441a4118fadd4c3f16e4dc600c271030b5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Sep 2010 18:58:33 -0400
Subject: tracing: Do not trace in irq when funcgraph-irq option is zero

When the function graph tracer funcgraph-irq option is zero, disable
tracing in IRQs. This makes the option have two effects.

1) When reading the trace file, do not display the functions that
   happen in interrupt context (when detected)

2) [*new*] When recording a trace, skip those that are detected
   to be in interrupt by the 'in_irq()' function

Note, in_irq() is updated at irq_enter() and irq_exit(). There are
still functions that are recorded by the function graph tracer that
is in interrupt context but outside the irq_enter/exit() routines.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8674750a5ece..02c708ae0d42 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,6 +15,9 @@
 #include "trace.h"
 #include "trace_output.h"
 
+/* When set, irq functions will be ignored */
+static int ftrace_graph_skip_irqs;
+
 struct fgraph_cpu_data {
 	pid_t		last_pid;
 	int		depth;
@@ -208,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
 	return 1;
 }
 
+static inline int ftrace_graph_ignore_irqs(void)
+{
+	if (!ftrace_graph_skip_irqs)
+		return 0;
+
+	return in_irq();
+}
+
 int trace_graph_entry(struct ftrace_graph_ent *trace)
 {
 	struct trace_array *tr = graph_array;
@@ -222,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 		return 0;
 
 	/* trace it when it is-nested-in or is a function enabled. */
-	if (!(trace->depth || ftrace_graph_addr(trace->func)))
+	if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
+	      ftrace_graph_ignore_irqs())
 		return 0;
 
 	local_irq_save(flags);
@@ -1334,6 +1346,14 @@ void graph_trace_close(struct trace_iterator *iter)
 	}
 }
 
+static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
+{
+	if (bit == TRACE_GRAPH_PRINT_IRQS)
+		ftrace_graph_skip_irqs = !set;
+
+	return 0;
+}
+
 static struct trace_event_functions graph_functions = {
 	.trace		= print_graph_function_event,
 };
@@ -1360,6 +1380,7 @@ static struct tracer graph_trace __read_mostly = {
 	.print_line	= print_graph_function,
 	.print_header	= print_graph_headers,
 	.flags		= &tracer_flags,
+	.set_flag	= func_graph_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_function_graph,
 #endif
-- 
cgit v1.2.3


From 79e406d7b00ab2b261ae32a59f266fd3b7af6f29 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Sep 2010 22:19:46 -0400
Subject: tracing: Remove leftover FTRACE_ENABLE/DISABLE_MCOUNT enums

The enums for FTRACE_ENABLE_MCOUNT and FTRACE_DISABLE_MCOUNT were
used as commands to ftrace_run_update_code(). But these commands
were used by the old nasty ftrace daemon that has long been slain.

This is a clean up patch to remove the references to these enums
and simplify the code a little.

Reported-by: Wu Zhangjin <wuzhangjin@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83a16e9ee518..20aff3f1c719 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -884,10 +884,8 @@ enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
 	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
-	FTRACE_ENABLE_MCOUNT		= (1 << 3),
-	FTRACE_DISABLE_MCOUNT		= (1 << 4),
-	FTRACE_START_FUNC_RET		= (1 << 5),
-	FTRACE_STOP_FUNC_RET		= (1 << 6),
+	FTRACE_START_FUNC_RET		= (1 << 3),
+	FTRACE_STOP_FUNC_RET		= (1 << 4),
 };
 
 static int ftrace_filtered;
@@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command)
 
 static void ftrace_startup_sysctl(void)
 {
-	int command = FTRACE_ENABLE_MCOUNT;
-
 	if (unlikely(ftrace_disabled))
 		return;
 
@@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
 	saved_ftrace_func = NULL;
 	/* ftrace_start_up is true if we want ftrace running */
 	if (ftrace_start_up)
-		command |= FTRACE_ENABLE_CALLS;
-
-	ftrace_run_update_code(command);
+		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
 }
 
 static void ftrace_shutdown_sysctl(void)
 {
-	int command = FTRACE_DISABLE_MCOUNT;
-
 	if (unlikely(ftrace_disabled))
 		return;
 
 	/* ftrace_start_up is true if ftrace is running */
 	if (ftrace_start_up)
-		command |= FTRACE_DISABLE_CALLS;
-
-	ftrace_run_update_code(command);
+		ftrace_run_update_code(FTRACE_DISABLE_CALLS);
 }
 
 static cycle_t		ftrace_update_time;
-- 
cgit v1.2.3


From e0cf0cd49632552f063fb3ae58691946da45fb2e Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Fri, 17 Sep 2010 11:09:04 -0400
Subject: jump label: Initialize workqueue tracepoints *before* they are
 registered

Initialize the workqueue data structures *before* they are registered
so that they are ready for callbacks.

Signed-off-by: Jason Baron <jbaron@redhat.com>
LKML-Reference: <e3a3383fc370ac7086625bebe89d9480d7caf372.1284733808.git.jbaron@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_workqueue.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
 {
 	int ret, cpu;
 
+	for_each_possible_cpu(cpu) {
+		spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
+	}
+
 	ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
 	if (ret)
 		goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
 	if (ret)
 		goto no_creation;
 
-	for_each_possible_cpu(cpu) {
-		spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
-		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
-	}
-
 	return 0;
 
 no_creation:
-- 
cgit v1.2.3


From d01343244abdedd18303d0323b518ed9cdcb1988 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 12 Oct 2010 12:06:43 -0400
Subject: ring-buffer: Fix typo of time extends per page

Time stamps for the ring buffer are created by the difference between
two events. Each page of the ring buffer holds a full 64 bit timestamp.
Each event has a 27 bit delta stamp from the last event. The unit of time
is nanoseconds, so 27 bits can hold ~134 milliseconds. If two events
happen more than 134 milliseconds apart, a time extend is inserted
to add more bits for the delta. The time extend has 59 bits, which
is good for ~18 years.

Currently the time extend is committed separately from the event.
If an event is discarded before it is committed, due to filtering,
the time extend still exists. If all events are being filtered, then
after ~134 milliseconds a new time extend will be added to the buffer.

This can only happen till the end of the page. Since each page holds
a full timestamp, there is no reason to add a time extend to the
beginning of a page. Time extends can only fill a page that has actual
data at the beginning, so there is no fear that time extends will fill
more than a page without any data.

When reading an event, a loop is made to skip over time extends
since they are only used to maintain the time stamp and are never
given to the caller. As a paranoid check to prevent the loop running
forever, with the knowledge that time extends may only fill a page,
a check is made that tests the iteration of the loop, and if the
iteration is more than the number of time extends that can fit in a page
a warning is printed and the ring buffer is disabled (all of ftrace
is also disabled with it).

There is another event type that is called a TIMESTAMP which can
hold 64 bits of data in the theoretical case that two events happen
18 years apart. This code has not been implemented, but the name
of this event exists, as well as the structure for it. The
size of a TIMESTAMP is 16 bytes, where as a time extend is only
8 bytes. The macro used to calculate how many time extends can fit on
a page used the TIMESTAMP size instead of the time extend size
cutting the amount in half.

The following test case can easily trigger the warning since we only
need to have half the page filled with time extends to trigger the
warning:

 # cd /sys/kernel/debug/tracing/
 # echo function > current_tracer
 # echo 'common_pid < 0' > events/ftrace/function/filter
 # echo > trace
 # echo 1 > trace_marker
 # sleep 120
 # cat trace

Enabling the function tracer and then setting the filter to only trace
functions where the process id is negative (no events), then clearing
the trace buffer to ensure that we have nothing in the buffer,
then write to trace_marker to add an event to the beginning of a page,
sleep for 2 minutes (only 35 seconds is probably needed, but this
guarantees the bug), and then finally reading the trace which will
trigger the bug.

This patch fixes the typo and prevents the false positive of that warning.

Reported-by: Hans J. Koch <hjk@linutronix.de>
Tested-by: Hans J. Koch <hjk@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Stable Kernel <stable@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 492197e2f86c..bca96377fd4e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta)
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
 /* Max number of timestamps that can fit on a page */
-#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
 
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
-- 
cgit v1.2.3


From 14cae9bd2faf6d0d75702c2e107e75207bcdfec1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Wed, 29 Sep 2010 10:08:23 +0200
Subject: tracing: Fix function-graph build warning on 32-bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix

kernel/trace/trace_functions_graph.c: In function ‘trace_print_graph_duration’:
kernel/trace/trace_functions_graph.c:652: warning: comparison of distinct pointer types lacks a cast

when building 36-rc6 on a 32-bit due to the strict type check failing
in the min() macro.

Signed-off-by: Borislav Petkov <bp@alien8.de>
Cc: Chase Douglas <chase.douglas@canonical.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20100929080823.GA13595@liondog.tnic>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_functions_graph.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 02c708ae0d42..ef49e9370b25 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -665,8 +665,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 
 	/* Print nsecs (we don't want to exceed 7 numbers) */
 	if (len < 7) {
-		snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
-			 nsecs_rem);
+		size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
+
+		snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
 		ret = trace_seq_printf(s, ".%s", nsecs_str);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-- 
cgit v1.2.3


From 72441cb1fd77d092f09ddfac748955703884c9a7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 13 Oct 2010 17:12:30 -0400
Subject: ftrace/x86: Add support for C version of recordmcount

This patch adds the support for the C version of recordmcount and
compile times show ~ 12% improvement.

After verifying this works, other archs can add:

 HAVE_C_MCOUNT_RECORD

in its Kconfig and it will use the C version of recordmcount
instead of the perl version.

Cc: <linux-arch@vger.kernel.org>
Cc: Michal Marek <mmarek@suse.cz>
Cc: linux-kbuild@vger.kernel.org
Cc: John Reiser <jreiser@bitwagon.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..df00fbbaf609 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
 	help
 	  See Documentation/trace/ftrace-design.txt
 
+config HAVE_C_MCOUNT_RECORD
+	bool
+	help
+	  C version of recordmcount available?
+
 config TRACER_MAX_TRACE
 	bool
 
-- 
cgit v1.2.3


From cf4db2597ae93b60efc0a7a4ec08690b75d629b1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 14 Oct 2010 23:32:44 -0400
Subject: ftrace: Rename config option HAVE_C_MCOUNT_RECORD to
 HAVE_C_RECORDMCOUNT

The config option used by archs to let the build system know that
the C version of the recordmcount works for said arch is currently
called HAVE_C_MCOUNT_RECORD which enables BUILD_C_RECORDMCOUNT. To
be more consistent with the name that all archs may use, it has been
renamed to HAVE_C_RECORDMCOUNT. This will be less confusing since
we are building a C recordmcount and not a mcount_record.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: <linux-arch@vger.kernel.org>
Cc: Michal Marek <mmarek@suse.cz>
Cc: linux-kbuild@vger.kernel.org
Cc: John Reiser <jreiser@bitwagon.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index df00fbbaf609..e550d2eda1df 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,7 +49,7 @@ config HAVE_SYSCALL_TRACEPOINTS
 	help
 	  See Documentation/trace/ftrace-design.txt
 
-config HAVE_C_MCOUNT_RECORD
+config HAVE_C_RECORDMCOUNT
 	bool
 	help
 	  C version of recordmcount available?
-- 
cgit v1.2.3


From 6038f373a3dc1f1c26496e60b6c40b164716f07e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 15 Aug 2010 18:52:59 +0200
Subject: llseek: automatically add .llseek fop

All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.

The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.

New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time.  Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.

The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.

Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.

Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.

===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
//   but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}

@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}

@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
   *off = E
|
   *off += E
|
   func(..., off, ...)
|
   E = *off
)
...+>
}

@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}

@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
  *off = E
|
  *off += E
|
  func(..., off, ...)
|
  E = *off
)
...+>
}

@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}

@ fops0 @
identifier fops;
@@
struct file_operations fops = {
 ...
};

@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
 .llseek = llseek_f,
...
};

@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
 .read = read_f,
...
};

@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
 .write = write_f,
...
};

@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
 .open = open_f,
...
};

// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
...  .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};

@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
...  .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};

// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
...  .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};

// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};

// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};

@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+	.llseek = default_llseek, /* write accesses f_pos */
};

// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////

@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// write fops use offset
struct file_operations fops = {
...
 .write = write_f,
 .read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};

@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};

@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};

@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
---
 kernel/trace/blktrace.c     | 2 ++
 kernel/trace/ftrace.c       | 2 ++
 kernel/trace/ring_buffer.c  | 1 +
 kernel/trace/trace_events.c | 6 ++++++
 kernel/trace/trace_stack.c  | 1 +
 5 files changed, 12 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..2d5f3a757316 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -326,6 +326,7 @@ static const struct file_operations blk_dropped_fops = {
 	.owner =	THIS_MODULE,
 	.open =		blk_dropped_open,
 	.read =		blk_dropped_read,
+	.llseek =	default_llseek,
 };
 
 static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -365,6 +366,7 @@ static const struct file_operations blk_msg_fops = {
 	.owner =	THIS_MODULE,
 	.open =		blk_msg_open,
 	.write =	blk_msg_write,
+	.llseek =	noop_llseek,
 };
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..5e1ad4763090 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -800,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
 	.open		= tracing_open_generic,
 	.read		= ftrace_profile_read,
 	.write		= ftrace_profile_write,
+	.llseek		= default_llseek,
 };
 
 /* used to initialize the real stat files */
@@ -2632,6 +2633,7 @@ static const struct file_operations ftrace_graph_fops = {
 	.read		= seq_read,
 	.write		= ftrace_graph_write,
 	.release	= ftrace_graph_release,
+	.llseek		= seq_lseek,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 492197e2f86c..3aea966d16de 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3965,6 +3965,7 @@ static const struct file_operations rb_simple_fops = {
 	.open		= tracing_open_generic,
 	.read		= rb_simple_read,
 	.write		= rb_simple_write,
+	.llseek		= default_llseek,
 };
 
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..0369c5e09984 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -951,6 +951,7 @@ static const struct file_operations ftrace_enable_fops = {
 	.open = tracing_open_generic,
 	.read = event_enable_read,
 	.write = event_enable_write,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_event_format_fops = {
@@ -963,29 +964,34 @@ static const struct file_operations ftrace_event_format_fops = {
 static const struct file_operations ftrace_event_id_fops = {
 	.open = tracing_open_generic,
 	.read = event_id_read,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_event_filter_fops = {
 	.open = tracing_open_generic,
 	.read = event_filter_read,
 	.write = event_filter_write,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_subsystem_filter_fops = {
 	.open = tracing_open_generic,
 	.read = subsystem_filter_read,
 	.write = subsystem_filter_write,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_system_enable_fops = {
 	.open = tracing_open_generic,
 	.read = system_enable_read,
 	.write = system_enable_write,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_show_header_fops = {
 	.open = tracing_open_generic,
 	.read = show_header,
+	.llseek = default_llseek,
 };
 
 static struct dentry *event_trace_events_dir(void)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a6b7e0e0f3eb..4c5dead0c239 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = {
 	.open		= tracing_open_generic,
 	.read		= stack_max_size_read,
 	.write		= stack_max_size_write,
+	.llseek		= default_llseek,
 };
 
 static void *
-- 
cgit v1.2.3


From 907f27840985fe6a0c62e43cd4702c6e04b4bcc7 Mon Sep 17 00:00:00 2001
From: matt mooney <mfm@muteddisk.com>
Date: Mon, 27 Sep 2010 19:04:53 -0700
Subject: tracing/trivial: Remove cast from void*

Unnecessary cast from void* in assignment.

Signed-off-by: matt mooney <mfm@muteddisk.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 4 ++--
 kernel/trace/trace.c  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 65fb077ea79c..ebd80d50c474 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1638,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
 
 	ret = ftrace_avail_open(inode, file);
 	if (!ret) {
-		m = (struct seq_file *)file->private_data;
-		iter = (struct ftrace_iterator *)m->private;
+		m = file->private_data;
+		iter = m->private;
 		iter->flags = FTRACE_ITER_FAILURES;
 	}
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f541156..001bcd2ccf4a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
 
 static int tracing_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	struct trace_iterator *iter;
 	int cpu;
 
-- 
cgit v1.2.3


From a9d61173dc1cb63e660ae89e874e51ba4fd2f991 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 24 Sep 2010 17:41:02 +0200
Subject: tracing: Add proper check for irq_depth routines

The check_irq_entry and check_irq_return could be called
from graph event context. In such case there's no graph
private data allocated. Adding checks to handle this case.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <20100924154102.GB1818@jolsa.brq.redhat.com>

[ Fixed some grammar in the comments ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ef49e9370b25..4c58ccc6427c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -888,12 +888,20 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
 		unsigned long addr, int depth)
 {
 	int cpu = iter->cpu;
+	int *depth_irq;
 	struct fgraph_data *data = iter->private;
-	int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
 
-	if (flags & TRACE_GRAPH_PRINT_IRQS)
+	/*
+	 * If we are either displaying irqs, or we got called as
+	 * a graph event and private data does not exist,
+	 * then we bypass the irq check.
+	 */
+	if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+	    (!data))
 		return 0;
 
+	depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
 	/*
 	 * We are inside the irq code
 	 */
@@ -926,12 +934,20 @@ static int
 check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
 {
 	int cpu = iter->cpu;
+	int *depth_irq;
 	struct fgraph_data *data = iter->private;
-	int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
 
-	if (flags & TRACE_GRAPH_PRINT_IRQS)
+	/*
+	 * If we are either displaying irqs, or we got called as
+	 * a graph event and private data does not exist,
+	 * then we bypass the irq check.
+	 */
+	if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+	    (!data))
 		return 0;
 
+	depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
 	/*
 	 * We are not inside the irq code.
 	 */
-- 
cgit v1.2.3


From 0a772620a2e21fb55a02f70fe38d4b5c3a5fbbbf Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 23 Sep 2010 14:00:52 +0200
Subject: tracing: Make graph related irqs/preemptsoff functions global

Move trace_graph_function() and print_graph_headers_flags() functions
to the trace_function_graph.c to be globaly available.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1285243253-7372-3-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h                 |  4 +++
 kernel/trace/trace_functions_graph.c | 63 ++++++++++++++++++++++++++++++++++--
 kernel/trace/trace_irqsoff.c         | 56 ++++----------------------------
 3 files changed, 71 insertions(+), 52 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a5..9021f8c0c0c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
+void trace_graph_function(struct trace_array *tr,
+		    unsigned long ip,
+		    unsigned long parent_ip,
+		    unsigned long flags, int pc);
 void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
 int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4c58ccc6427c..6f8fe28acba1 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -262,6 +262,35 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
 		return trace_graph_entry(trace);
 }
 
+static void
+__trace_graph_function(struct trace_array *tr,
+		unsigned long ip, unsigned long flags, int pc)
+{
+	u64 time = trace_clock_local();
+	struct ftrace_graph_ent ent = {
+		.func  = ip,
+		.depth = 0,
+	};
+	struct ftrace_graph_ret ret = {
+		.func     = ip,
+		.depth    = 0,
+		.calltime = time,
+		.rettime  = time,
+	};
+
+	__trace_graph_entry(tr, &ent, flags, pc);
+	__trace_graph_return(tr, &ret, flags, pc);
+}
+
+void
+trace_graph_function(struct trace_array *tr,
+		unsigned long ip, unsigned long parent_ip,
+		unsigned long flags, int pc)
+{
+	__trace_graph_function(tr, parent_ip, flags, pc);
+	__trace_graph_function(tr, ip, flags, pc);
+}
+
 void __trace_graph_return(struct trace_array *tr,
 				struct ftrace_graph_ret *trace,
 				unsigned long flags,
@@ -1179,7 +1208,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
 
 
 enum print_line_t
-print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 {
 	struct ftrace_graph_ent_entry *field;
 	struct fgraph_data *data = iter->private;
@@ -1242,7 +1271,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 static enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
-	return print_graph_function_flags(iter, tracer_flags.val);
+	return __print_graph_function_flags(iter, tracer_flags.val);
+}
+
+enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
+					     u32 flags)
+{
+	if (trace_flags & TRACE_ITER_LATENCY_FMT)
+		flags |= TRACE_GRAPH_PRINT_DURATION;
+	else
+		flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+	return __print_graph_function_flags(iter, flags);
 }
 
 static enum print_line_t
@@ -1274,7 +1314,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
 	seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
 }
 
-void print_graph_headers_flags(struct seq_file *s, u32 flags)
+static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
 {
 	int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
 
@@ -1315,6 +1355,23 @@ void print_graph_headers(struct seq_file *s)
 	print_graph_headers_flags(s, tracer_flags.val);
 }
 
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+	struct trace_iterator *iter = s->private;
+
+	if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+		/* print nothing if the buffers are empty */
+		if (trace_empty(iter))
+			return;
+
+		print_trace_header(s, iter);
+		flags |= TRACE_GRAPH_PRINT_DURATION;
+	} else
+		flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+	__print_graph_headers_flags(s, flags);
+}
+
 void graph_trace_open(struct trace_iterator *iter)
 {
 	/* pid and depth on the last trace processed */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2e..4047e98afcba 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
 
 static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
 {
-	u32 flags = GRAPH_TRACER_FLAGS;
-
-	if (trace_flags & TRACE_ITER_LATENCY_FMT)
-		flags |= TRACE_GRAPH_PRINT_DURATION;
-	else
-		flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
 	/*
 	 * In graph mode call the graph tracer output function,
 	 * otherwise go with the TRACE_FN event handler
 	 */
 	if (is_graph())
-		return print_graph_function_flags(iter, flags);
+		return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
 
 	return TRACE_TYPE_UNHANDLED;
 }
 
 static void irqsoff_print_header(struct seq_file *s)
 {
-	if (is_graph()) {
-		struct trace_iterator *iter = s->private;
-		u32 flags = GRAPH_TRACER_FLAGS;
-
-		if (trace_flags & TRACE_ITER_LATENCY_FMT) {
-			/* print nothing if the buffers are empty */
-			if (trace_empty(iter))
-				return;
-
-			print_trace_header(s, iter);
-			flags |= TRACE_GRAPH_PRINT_DURATION;
-		} else
-			flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
-		print_graph_headers_flags(s, flags);
-	} else
+	if (is_graph())
+		print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+	else
 		trace_default_header(s);
 }
 
-static void
-trace_graph_function(struct trace_array *tr,
-		 unsigned long ip, unsigned long flags, int pc)
-{
-	u64 time = trace_clock_local();
-	struct ftrace_graph_ent ent = {
-		.func  = ip,
-		.depth = 0,
-	};
-	struct ftrace_graph_ret ret = {
-		.func     = ip,
-		.depth    = 0,
-		.calltime = time,
-		.rettime  = time,
-	};
-
-	__trace_graph_entry(tr, &ent, flags, pc);
-	__trace_graph_return(tr, &ret, flags, pc);
-}
-
 static void
 __trace_function(struct trace_array *tr,
 		 unsigned long ip, unsigned long parent_ip,
 		 unsigned long flags, int pc)
 {
-	if (!is_graph())
+	if (is_graph())
+		trace_graph_function(tr, ip, parent_ip, flags, pc);
+	else
 		trace_function(tr, ip, parent_ip, flags, pc);
-	else {
-		trace_graph_function(tr, parent_ip, flags, pc);
-		trace_graph_function(tr, ip, flags, pc);
-	}
 }
 
 #else
-- 
cgit v1.2.3


From 7495a5beaa22f190f4888aa8cbe4827c16575d0a Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 23 Sep 2010 14:00:53 +0200
Subject: tracing: Graph support for wakeup tracer

Add function graph support for wakeup latency tracer.
The graph output is enabled by setting the 'display-graph'
trace option.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1285243253-7372-4-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_sched_wakeup.c | 231 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 221 insertions(+), 10 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81b..033510dbb322 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,13 +31,33 @@ static int			wakeup_rt;
 static arch_spinlock_t wakeup_lock =
 	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 
+static void wakeup_reset(struct trace_array *tr);
 static void __wakeup_reset(struct trace_array *tr);
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
 
 static int save_lat_flag;
 
+#define TRACE_DISPLAY_GRAPH     1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	/* display latency trace as call graph */
+	{ TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+	{ } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+	.val  = 0,
+	.opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
 #ifdef CONFIG_FUNCTION_TRACER
 /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * wakeup uses its own tracer function to keep the overhead down:
  */
 static void
 wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
@@ -80,8 +100,191 @@ static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = wakeup_tracer_call,
 };
+
+static int start_func_tracer(int graph)
+{
+	int ret;
+
+	if (!graph)
+		ret = register_ftrace_function(&trace_ops);
+	else
+		ret = register_ftrace_graph(&wakeup_graph_return,
+					    &wakeup_graph_entry);
+
+	if (!ret && tracing_is_enabled())
+		tracer_enabled = 1;
+	else
+		tracer_enabled = 0;
+
+	return ret;
+}
+
+static void stop_func_tracer(int graph)
+{
+	tracer_enabled = 0;
+
+	if (!graph)
+		unregister_ftrace_function(&trace_ops);
+	else
+		unregister_ftrace_graph();
+}
+
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+
+	if (!(bit & TRACE_DISPLAY_GRAPH))
+		return -EINVAL;
+
+	if (!(is_graph() ^ set))
+		return 0;
+
+	stop_func_tracer(!set);
+
+	wakeup_reset(wakeup_trace);
+	tracing_max_latency = 0;
+
+	return start_func_tracer(set);
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu, pc, ret = 0;
+
+	if (likely(!wakeup_task))
+		return 0;
+
+	pc = preempt_count();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+	if (cpu != wakeup_current_cpu)
+		goto out_enable;
+
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	local_save_flags(flags);
+	ret = __trace_graph_entry(tr, trace, flags, pc);
+
+out:
+	atomic_dec(&data->disabled);
+
+out_enable:
+	preempt_enable_notrace();
+	return ret;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu, pc;
+
+	if (likely(!wakeup_task))
+		return;
+
+	pc = preempt_count();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+	if (cpu != wakeup_current_cpu)
+		goto out_enable;
+
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	local_save_flags(flags);
+	__trace_graph_return(tr, trace, flags, pc);
+
+out:
+	atomic_dec(&data->disabled);
+
+out_enable:
+	preempt_enable_notrace();
+	return;
+}
+
+static void wakeup_trace_open(struct trace_iterator *iter)
+{
+	if (is_graph())
+		graph_trace_open(iter);
+}
+
+static void wakeup_trace_close(struct trace_iterator *iter)
+{
+	if (iter->private)
+		graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+	/*
+	 * In graph mode call the graph tracer output function,
+	 * otherwise go with the TRACE_FN event handler
+	 */
+	if (is_graph())
+		return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_print_header(struct seq_file *s)
+{
+	if (is_graph())
+		print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+	else
+		trace_default_header(s);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+		 unsigned long ip, unsigned long parent_ip,
+		 unsigned long flags, int pc)
+{
+	if (is_graph())
+		trace_graph_function(tr, ip, parent_ip, flags, pc);
+	else
+		trace_function(tr, ip, parent_ip, flags, pc);
+}
+#else
+#define __trace_function trace_function
+
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+	return -EINVAL;
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+	return -1;
+}
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+static void wakeup_print_header(struct seq_file *s) { }
+static void wakeup_trace_open(struct trace_iterator *iter) { }
+static void wakeup_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 /*
  * Should this new latency be reported/recorded?
  */
@@ -152,7 +355,7 @@ probe_wakeup_sched_switch(void *ignore,
 	/* The task we are waiting for is waking up */
 	data = wakeup_trace->data[wakeup_cpu];
 
-	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
 	T0 = data->preempt_timestamp;
@@ -252,7 +455,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
 	 * is not called by an assembly function  (where as schedule is)
 	 * it should be safe to use it here.
 	 */
-	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 
 out_locked:
 	arch_spin_unlock(&wakeup_lock);
@@ -303,12 +506,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
 	 */
 	smp_wmb();
 
-	register_ftrace_function(&trace_ops);
-
-	if (tracing_is_enabled())
-		tracer_enabled = 1;
-	else
-		tracer_enabled = 0;
+	if (start_func_tracer(is_graph()))
+		printk(KERN_ERR "failed to start wakeup tracer\n");
 
 	return;
 fail_deprobe_wake_new:
@@ -320,7 +519,7 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	unregister_ftrace_function(&trace_ops);
+	stop_func_tracer(is_graph());
 	unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
 	unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
 	unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +578,15 @@ static struct tracer wakeup_tracer __read_mostly =
 	.start		= wakeup_tracer_start,
 	.stop		= wakeup_tracer_stop,
 	.print_max	= 1,
+	.print_header	= wakeup_print_header,
+	.print_line	= wakeup_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= wakeup_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
+	.open		= wakeup_trace_open,
+	.close		= wakeup_trace_close,
 	.use_max_tr	= 1,
 };
 
@@ -394,9 +599,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
 	.stop		= wakeup_tracer_stop,
 	.wait_pipe	= poll_wait_pipe,
 	.print_max	= 1,
+	.print_header	= wakeup_print_header,
+	.print_line	= wakeup_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= wakeup_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
+	.open		= wakeup_trace_open,
+	.close		= wakeup_trace_close,
 	.use_max_tr	= 1,
 };
 
-- 
cgit v1.2.3


From 542181d3769d001c59cd17573dd4381e87d215f2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 Oct 2010 16:38:49 -0400
Subject: tracing: Use one prologue for the wakeup tracer function tracers

The wakeup tracer has three types of function tracers. Normal
function tracer, function graph entry, and function graph return.
Each of these use a complex dance to prevent recursion and whether
to trace the data or not (depending on the wake_task variable).

This patch moves the duplicate code into a single routine, to
prevent future mistakes with modifying duplicate complex code.

Cc: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_sched_wakeup.c | 102 +++++++++++++++++++-------------------
 1 file changed, 50 insertions(+), 52 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 033510dbb322..31689d2df7f3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -56,43 +56,73 @@ static struct tracer_flags tracer_flags = {
 #define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
 
 #ifdef CONFIG_FUNCTION_TRACER
+
 /*
- * wakeup uses its own tracer function to keep the overhead down:
+ * Prologue for the wakeup function tracers.
+ *
+ * Returns 1 if it is OK to continue, and preemption
+ *            is disabled and data->disabled is incremented.
+ *         0 if the trace is to be ignored, and preemption
+ *            is not disabled and data->disabled is
+ *            kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
  */
-static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int
+func_prolog_preempt_disable(struct trace_array *tr,
+			    struct trace_array_cpu **data,
+			    int *pc)
 {
-	struct trace_array *tr = wakeup_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
 	long disabled;
 	int cpu;
-	int pc;
 
 	if (likely(!wakeup_task))
-		return;
+		return 0;
 
-	pc = preempt_count();
+	*pc = preempt_count();
 	preempt_disable_notrace();
 
 	cpu = raw_smp_processor_id();
 	if (cpu != wakeup_current_cpu)
 		goto out_enable;
 
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
+	*data = tr->data[cpu];
+	disabled = atomic_inc_return(&(*data)->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
 
-	local_irq_save(flags);
+	return 1;
 
-	trace_function(tr, ip, parent_ip, flags, pc);
+out:
+	atomic_dec(&(*data)->disabled);
+
+out_enable:
+	preempt_enable_notrace();
+	return 0;
+}
+
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	int pc;
+
+	if (!func_prolog_preempt_disable(tr, &data, &pc))
+		return;
 
+	local_irq_save(flags);
+	trace_function(tr, ip, parent_ip, flags, pc);
 	local_irq_restore(flags);
 
- out:
 	atomic_dec(&data->disabled);
- out_enable:
 	preempt_enable_notrace();
 }
 
@@ -154,32 +184,16 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	long disabled;
-	int cpu, pc, ret = 0;
+	int pc, ret = 0;
 
-	if (likely(!wakeup_task))
+	if (!func_prolog_preempt_disable(tr, &data, &pc))
 		return 0;
 
-	pc = preempt_count();
-	preempt_disable_notrace();
-
-	cpu = raw_smp_processor_id();
-	if (cpu != wakeup_current_cpu)
-		goto out_enable;
-
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-	if (unlikely(disabled != 1))
-		goto out;
-
 	local_save_flags(flags);
 	ret = __trace_graph_entry(tr, trace, flags, pc);
-
-out:
 	atomic_dec(&data->disabled);
-
-out_enable:
 	preempt_enable_notrace();
+
 	return ret;
 }
 
@@ -188,31 +202,15 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	long disabled;
-	int cpu, pc;
+	int pc;
 
-	if (likely(!wakeup_task))
+	if (!func_prolog_preempt_disable(tr, &data, &pc))
 		return;
 
-	pc = preempt_count();
-	preempt_disable_notrace();
-
-	cpu = raw_smp_processor_id();
-	if (cpu != wakeup_current_cpu)
-		goto out_enable;
-
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-	if (unlikely(disabled != 1))
-		goto out;
-
 	local_save_flags(flags);
 	__trace_graph_return(tr, trace, flags, pc);
-
-out:
 	atomic_dec(&data->disabled);
 
-out_enable:
 	preempt_enable_notrace();
 	return;
 }
-- 
cgit v1.2.3


From 5e6d2b9cfa3a6e7fe62fc0135bc1bd778f5db564 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 Oct 2010 19:41:43 -0400
Subject: tracing: Use one prologue for the preempt irqs off tracer function
 tracers

The preempt and irqsoff tracers have three types of function tracers.
Normal function tracer, function graph entry, and function graph return.
Each of these use a complex dance to prevent recursion and whether
to trace the data or not (depending if interrupts are enabled or not).

This patch moves the duplicate code into a single routine, to
prevent future mistakes with modifying duplicate complex code.

Cc: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_irqsoff.c | 96 ++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 48 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 4047e98afcba..5cf8c602b880 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp	unsigned long max_sequence;
 
 #ifdef CONFIG_FUNCTION_TRACER
 /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ *            incremented.
+ *         0 if the trace is to be ignored, and data->disabled
+ *            is kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
  */
-static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int func_prolog_dec(struct trace_array *tr,
+			   struct trace_array_cpu **data,
+			   unsigned long *flags)
 {
-	struct trace_array *tr = irqsoff_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
 	long disabled;
 	int cpu;
 
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	 */
 	cpu = raw_smp_processor_id();
 	if (likely(!per_cpu(tracing_cpu, cpu)))
-		return;
+		return 0;
 
-	local_save_flags(flags);
+	local_save_flags(*flags);
 	/* slight chance to get a false positive on tracing_cpu */
-	if (!irqs_disabled_flags(flags))
-		return;
+	if (!irqs_disabled_flags(*flags))
+		return 0;
 
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
+	*data = tr->data[cpu];
+	disabled = atomic_inc_return(&(*data)->disabled);
 
 	if (likely(disabled == 1))
-		trace_function(tr, ip, parent_ip, flags, preempt_count());
+		return 1;
+
+	atomic_dec(&(*data)->disabled);
+
+	return 0;
+}
+
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (!func_prolog_dec(tr, &data, &flags))
+		return;
+
+	trace_function(tr, ip, parent_ip, flags, preempt_count());
 
 	atomic_dec(&data->disabled);
 }
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	long disabled;
 	int ret;
-	int cpu;
 	int pc;
 
-	cpu = raw_smp_processor_id();
-	if (likely(!per_cpu(tracing_cpu, cpu)))
-		return 0;
-
-	local_save_flags(flags);
-	/* slight chance to get a false positive on tracing_cpu */
-	if (!irqs_disabled_flags(flags))
+	if (!func_prolog_dec(tr, &data, &flags))
 		return 0;
 
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		ret = __trace_graph_entry(tr, trace, flags, pc);
-	} else
-		ret = 0;
-
+	pc = preempt_count();
+	ret = __trace_graph_entry(tr, trace, flags, pc);
 	atomic_dec(&data->disabled);
+
 	return ret;
 }
 
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	long disabled;
-	int cpu;
 	int pc;
 
-	cpu = raw_smp_processor_id();
-	if (likely(!per_cpu(tracing_cpu, cpu)))
-		return;
-
-	local_save_flags(flags);
-	/* slight chance to get a false positive on tracing_cpu */
-	if (!irqs_disabled_flags(flags))
+	if (!func_prolog_dec(tr, &data, &flags))
 		return;
 
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		__trace_graph_return(tr, trace, flags, pc);
-	}
-
+	pc = preempt_count();
+	__trace_graph_return(tr, trace, flags, pc);
 	atomic_dec(&data->disabled);
 }
 
-- 
cgit v1.2.3


From 78c89ba121221d9224a5747803d7fffe51cd6e44 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 Oct 2010 23:22:19 -0400
Subject: tracing: Remove parent recording in latency tracer graph options

Even though the parent is recorded with the normal function tracing
of the latency tracers (irqsoff and wakeup), the function graph
recording is bogus.

This is due to the function graph messing with the return stack.
The latency tracers pass in as the parent CALLER_ADDR0, which
works fine for plain function tracing. But this causes bogus output
with the graph tracer:

 3)    <idle>-0    |  d.s3.  0.000 us    |  return_to_handler();
 3)    <idle>-0    |  d.s3.  0.000 us    |  _raw_spin_unlock_irqrestore();
 3)    <idle>-0    |  d.s3.  0.000 us    |  return_to_handler();
 3)    <idle>-0    |  d.s3.  0.000 us    |  trace_hardirqs_on();

The "return_to_handle()" call is the trampoline of the
function graph tracer, and is meaningless in this context.

Cc: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f8fe28acba1..76b05980225c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -287,7 +287,6 @@ trace_graph_function(struct trace_array *tr,
 		unsigned long ip, unsigned long parent_ip,
 		unsigned long flags, int pc)
 {
-	__trace_graph_function(tr, parent_ip, flags, pc);
 	__trace_graph_function(tr, ip, flags, pc);
 }
 
-- 
cgit v1.2.3


From 01b284f9b6d51cc3f3bcf3b49f16d2601d3ca22d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Sep 2010 20:39:22 +0200
Subject: blktrace: remove the big kernel lock

According to Jens, this code does not need the BKL at all,
it is sufficiently serialized by bd_mutex.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Jens Axboe <jaxboe@fusionio.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/blktrace.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..5328e8779d4d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/debugfs.h>
-#include <linux/smp_lock.h>
 #include <linux/time.h>
 #include <linux/uaccess.h>
 
@@ -639,7 +638,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	if (!q)
 		return -ENXIO;
 
-	lock_kernel();
 	mutex_lock(&bdev->bd_mutex);
 
 	switch (cmd) {
@@ -667,7 +665,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	}
 
 	mutex_unlock(&bdev->bd_mutex);
-	unlock_kernel();
 	return ret;
 }
 
@@ -1652,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	struct block_device *bdev;
 	ssize_t ret = -ENXIO;
 
-	lock_kernel();
 	bdev = bdget(part_devt(p));
 	if (bdev == NULL)
-		goto out_unlock_kernel;
+		goto out;
 
 	q = blk_trace_get_queue(bdev);
 	if (q == NULL)
@@ -1683,8 +1679,7 @@ out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 out_bdput:
 	bdput(bdev);
-out_unlock_kernel:
-	unlock_kernel();
+out:
 	return ret;
 }
 
@@ -1714,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	ret = -ENXIO;
 
-	lock_kernel();
 	p = dev_to_part(dev);
 	bdev = bdget(part_devt(p));
 	if (bdev == NULL)
-		goto out_unlock_kernel;
+		goto out;
 
 	q = blk_trace_get_queue(bdev);
 	if (q == NULL)
@@ -1753,8 +1747,6 @@ out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 out_bdput:
 	bdput(bdev);
-out_unlock_kernel:
-	unlock_kernel();
 out:
 	return ret ? ret : count;
 }
-- 
cgit v1.2.3


From 7e40798f406fe73f9bac496a390daabd8768a8f7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 19 Oct 2010 10:56:19 -0400
Subject: tracing: Fix compile issue for trace_sched_wakeup.c

The function start_func_tracer() was incorrectly added in the
 #ifdef CONFIG_FUNCTION_TRACER condition, but is still used even
when function tracing is not enabled.

The calls to register_ftrace_function() and register_ftrace_graph()
become nops (and their arguments are even ignored), thus there is
no reason to hide start_func_tracer() when function tracing is
not enabled.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_sched_wakeup.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 31689d2df7f3..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -130,6 +130,7 @@ static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = wakeup_tracer_call,
 };
+#endif /* CONFIG_FUNCTION_TRACER */
 
 static int start_func_tracer(int graph)
 {
@@ -159,8 +160,6 @@ static void stop_func_tracer(int graph)
 		unregister_ftrace_graph();
 }
 
-#endif /* CONFIG_FUNCTION_TRACER */
-
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
 {
-- 
cgit v1.2.3


From 747e94ae3d1b4c9bf5380e569f614eb9040b79e7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 Oct 2010 13:51:48 -0400
Subject: ring-buffer: Make write slow path out of line

Gcc inlines the slow path of the ring buffer write which can
hurt performance. This patch simply forces the slow path function
rb_move_tail() to always be a function.

The ring_buffer_benchmark module with reader_disabled=1 shows that
this patch changes the time to record an event from 135 ns to
132 ns. (3 ns or 2.22% improvement)

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bca96377fd4e..0b88df849a59 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1823,7 +1823,10 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	local_sub(length, &tail_page->write);
 }
 
-static struct ring_buffer_event *
+/*
+ * This is the slow path, force gcc not to inline it.
+ */
+static noinline struct ring_buffer_event *
 rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	     unsigned long length, unsigned long tail,
 	     struct buffer_page *tail_page, u64 *ts)
@@ -1943,7 +1946,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	tail = write - length;
 
 	/* See if we shot pass the end of this buffer page */
-	if (write > BUF_PAGE_SIZE)
+	if (unlikely(write > BUF_PAGE_SIZE))
 		return rb_move_tail(cpu_buffer, length, tail,
 				    tail_page, ts);
 
-- 
cgit v1.2.3


From e8bc43e84fada397af1b677b07dbf26e6ac78fcc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 20 Oct 2010 10:58:02 -0400
Subject: ring-buffer: Pass timestamp by value and not by reference

The original code for the ring buffer had locations that modified
the timestamp and that change was used by the callers. Now,
the timestamp is not reused by the callers and there is no reason
to pass it by reference.

By changing the call to pass by value, lets gcc optimize the code
a bit more where it can store the timestamp in a register and not
worry about updating the reference.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0b88df849a59..c8ce6bde7fa4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1829,7 +1829,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 static noinline struct ring_buffer_event *
 rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	     unsigned long length, unsigned long tail,
-	     struct buffer_page *tail_page, u64 *ts)
+	     struct buffer_page *tail_page, u64 ts)
 {
 	struct buffer_page *commit_page = cpu_buffer->commit_page;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1912,8 +1912,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 		 * Nested commits always have zero deltas, so
 		 * just reread the time stamp
 		 */
-		*ts = rb_time_stamp(buffer);
-		next_page->page->time_stamp = *ts;
+		ts = rb_time_stamp(buffer);
+		next_page->page->time_stamp = ts;
 	}
 
  out_again:
@@ -1932,7 +1932,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-		  unsigned type, unsigned long length, u64 *ts)
+		  unsigned type, unsigned long length, u64 ts)
 {
 	struct buffer_page *tail_page;
 	struct ring_buffer_event *event;
@@ -1965,7 +1965,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	 * its timestamp.
 	 */
 	if (!tail)
-		tail_page->page->time_stamp = *ts;
+		tail_page->page->time_stamp = ts;
 
 	return event;
 }
@@ -2008,7 +2008,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-		  u64 *ts, u64 *delta)
+		  u64 ts, u64 *delta)
 {
 	struct ring_buffer_event *event;
 	int ret;
@@ -2016,7 +2016,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 	WARN_ONCE(*delta > (1ULL << 59),
 		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
 		  (unsigned long long)*delta,
-		  (unsigned long long)*ts,
+		  (unsigned long long)ts,
 		  (unsigned long long)cpu_buffer->write_stamp);
 
 	/*
@@ -2051,7 +2051,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 				event->array[0] = 0;
 			}
 		}
-		cpu_buffer->write_stamp = *ts;
+		cpu_buffer->write_stamp = ts;
 		/* let the caller know this was the commit */
 		ret = 1;
 	} else {
@@ -2175,7 +2175,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
 
-			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
+			commit = rb_add_time_stamp(cpu_buffer, ts, &delta);
 			if (commit == -EBUSY)
 				goto out_fail;
 
@@ -2187,7 +2187,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	}
 
  get_event:
-	event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+	event = __rb_reserve_next(cpu_buffer, 0, length, ts);
 	if (unlikely(PTR_ERR(event) == -EAGAIN))
 		goto again;
 
-- 
cgit v1.2.3


From f25106aeab7408394b9dd707e5ecf557e269c723 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 20 Oct 2010 12:40:12 -0400
Subject: ring-buffer: Pass delta by value and not by reference

The delta between events is passed to the timestamp code by reference
and the timestamp code will reset the value. But it can be reset
from the caller. No need to pass it in by reference.

By changing the call to pass by value, lets gcc optimize the code
a bit more where it can store the delta in a register and not
worry about updating the reference.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c8ce6bde7fa4..3af77cd47f21 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2008,14 +2008,14 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-		  u64 ts, u64 *delta)
+		  u64 ts, u64 delta)
 {
 	struct ring_buffer_event *event;
 	int ret;
 
-	WARN_ONCE(*delta > (1ULL << 59),
+	WARN_ONCE(delta > (1ULL << 59),
 		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
-		  (unsigned long long)*delta,
+		  (unsigned long long)delta,
 		  (unsigned long long)ts,
 		  (unsigned long long)cpu_buffer->write_stamp);
 
@@ -2041,8 +2041,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		 * and if we can't just make it zero.
 		 */
 		if (rb_event_index(event)) {
-			event->time_delta = *delta & TS_MASK;
-			event->array[0] = *delta >> TS_SHIFT;
+			event->time_delta = delta & TS_MASK;
+			event->array[0] = delta >> TS_SHIFT;
 		} else {
 			/* try to discard, since we do not need this */
 			if (!rb_try_to_discard(cpu_buffer, event)) {
@@ -2064,8 +2064,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		ret = 0;
 	}
 
-	*delta = 0;
-
 	return ret;
 }
 
@@ -2175,7 +2173,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
 
-			commit = rb_add_time_stamp(cpu_buffer, ts, &delta);
+			commit = rb_add_time_stamp(cpu_buffer, ts, delta);
+			delta = 0;
+
 			if (commit == -EBUSY)
 				goto out_fail;
 
-- 
cgit v1.2.3


From 69d1b839f7eee347e357b3f6cce7f630cc6ff93d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 Oct 2010 18:18:05 -0400
Subject: ring-buffer: Bind time extend and data events together

When the time between two timestamps is greater than
2^27 nanosecs (~134 ms) a time extend event is added that extends
the time difference to 59 bits (~18 years). This is due to
events only having a 27 bit field to store time.

Currently this time extend is a separate event. We add it just before
the event data that is being written to the buffer. But before
the event data is committed, the event data can also be discarded (as
with the case of filters). But because the time extend has already been
committed, it will stay in the buffer.

If lots of events are being filtered and no event is being
written, then every 134ms a time extend can be added to the buffer
without any data attached. To keep from filling the entire buffer
with time extends, a time extend will never be the first event
in a page because the page timestamp can be used. Time extends can
only fill the rest of a page with some data at the beginning.

This patch binds the time extend with the data. The difference here
is that the time extend is not committed before the data is added.
Instead, when a time extend is needed, the space reserved on
the ring buffer is the time extend + the data event size. The
time extend is added to the first part of the reserved block and
the data is added to the second. The time extend event is passed
back to the reserver, but since the reserver also uses a function
to find the data portion of the reserved block, no changes to the
ring buffer interface need to be made.

When a commit is discarded, we now remove both the time extend and
the event. With this approach no more than one time extend can
be in the buffer in a row. Data must always follow a time extend.

Thanks to Mathieu Desnoyers for suggesting this idea.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 266 ++++++++++++++++++++++++---------------------
 1 file changed, 142 insertions(+), 124 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3af77cd47f21..f50f43107e93 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
 	RB_LEN_TIME_STAMP = 16,
 };
 
+#define skip_time_extend(event) \
+	((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
 	return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
 	return length + RB_EVNT_HDR_SIZE;
 }
 
-/* inline for ring buffer fast paths */
-static unsigned
+/*
+ * Return the length of the given event. Will return
+ * the length of the time extend if the event is a
+ * time extend.
+ */
+static inline unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
 	switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
 	return 0;
 }
 
+/*
+ * Return total length of time extend and data,
+ *   or just the event length for all other events.
+ */
+static inline unsigned
+rb_event_ts_length(struct ring_buffer_event *event)
+{
+	unsigned len = 0;
+
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+		/* time extends include the data event after it */
+		len = RB_LEN_TIME_EXTEND;
+		event = skip_time_extend(event);
+	}
+	return len + rb_event_length(event);
+}
+
 /**
  * ring_buffer_event_length - return the length of the event
  * @event: the event to get the length of
+ *
+ * Returns the size of the data load of a data event.
+ * If the event is something other than a data event, it
+ * returns the size of the event itself. With the exception
+ * of a TIME EXTEND, where it still returns the size of the
+ * data load of the data event after it.
  */
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
-	unsigned length = rb_event_length(event);
+	unsigned length;
+
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+		event = skip_time_extend(event);
+
+	length = rb_event_length(event);
 	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 		return length;
 	length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static void *
 rb_event_data(struct ring_buffer_event *event)
 {
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+		event = skip_time_extend(event);
 	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 	/* If length is in len field, then array[0] has the data */
 	if (event->type_len)
@@ -1546,6 +1583,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
 	iter->head = 0;
 }
 
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+{
+	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+
+	/* Not the first event on the page? */
+	if (rb_event_index(event)) {
+		event->time_delta = delta & TS_MASK;
+		event->array[0] = delta >> TS_SHIFT;
+	} else {
+		/* nope, just zero it */
+		event->time_delta = 0;
+		event->array[0] = 0;
+	}
+
+	return skip_time_extend(event);
+}
+
 /**
  * ring_buffer_update_event - update event type and data
  * @event: the even to update
@@ -1558,28 +1614,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
  * data field.
  */
 static void
-rb_update_event(struct ring_buffer_event *event,
-			 unsigned type, unsigned length)
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+		struct ring_buffer_event *event, unsigned length,
+		int add_timestamp, u64 delta)
 {
-	event->type_len = type;
-
-	switch (type) {
-
-	case RINGBUF_TYPE_PADDING:
-	case RINGBUF_TYPE_TIME_EXTEND:
-	case RINGBUF_TYPE_TIME_STAMP:
-		break;
+	/* Only a commit updates the timestamp */
+	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+		delta = 0;
 
-	case 0:
-		length -= RB_EVNT_HDR_SIZE;
-		if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
-			event->array[0] = length;
-		else
-			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-		break;
-	default:
-		BUG();
+	/*
+	 * If we need to add a timestamp, then we
+	 * add it to the start of the resevered space.
+	 */
+	if (unlikely(add_timestamp)) {
+		event = rb_add_time_stamp(event, delta);
+		length -= RB_LEN_TIME_EXTEND;
+		delta = 0;
 	}
+
+	event->time_delta = delta;
+	length -= RB_EVNT_HDR_SIZE;
+	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+		event->type_len = 0;
+		event->array[0] = length;
+	} else
+		event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
 }
 
 /*
@@ -1932,12 +1991,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-		  unsigned type, unsigned long length, u64 ts)
+		  unsigned long length, u64 ts,
+		  u64 delta, int add_timestamp)
 {
 	struct buffer_page *tail_page;
 	struct ring_buffer_event *event;
 	unsigned long tail, write;
 
+	/*
+	 * If the time delta since the last event is too big to
+	 * hold in the time field of the event, then we append a
+	 * TIME EXTEND event ahead of the data event.
+	 */
+	if (unlikely(add_timestamp))
+		length += RB_LEN_TIME_EXTEND;
+
 	tail_page = cpu_buffer->tail_page;
 	write = local_add_return(length, &tail_page->write);
 
@@ -1954,11 +2022,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
-	rb_update_event(event, type, length);
+	rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
 
-	/* The passed in type is zero for DATA */
-	if (likely(!type))
-		local_inc(&tail_page->entries);
+	local_inc(&tail_page->entries);
 
 	/*
 	 * If this is the first commit on the page, then update
@@ -1980,7 +2046,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 	unsigned long addr;
 
 	new_index = rb_event_index(event);
-	old_index = new_index + rb_event_length(event);
+	old_index = new_index + rb_event_ts_length(event);
 	addr = (unsigned long)event;
 	addr &= PAGE_MASK;
 
@@ -2006,67 +2072,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 	return 0;
 }
 
-static int
-rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-		  u64 ts, u64 delta)
-{
-	struct ring_buffer_event *event;
-	int ret;
-
-	WARN_ONCE(delta > (1ULL << 59),
-		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
-		  (unsigned long long)delta,
-		  (unsigned long long)ts,
-		  (unsigned long long)cpu_buffer->write_stamp);
-
-	/*
-	 * The delta is too big, we to add a
-	 * new timestamp.
-	 */
-	event = __rb_reserve_next(cpu_buffer,
-				  RINGBUF_TYPE_TIME_EXTEND,
-				  RB_LEN_TIME_EXTEND,
-				  ts);
-	if (!event)
-		return -EBUSY;
-
-	if (PTR_ERR(event) == -EAGAIN)
-		return -EAGAIN;
-
-	/* Only a commited time event can update the write stamp */
-	if (rb_event_is_commit(cpu_buffer, event)) {
-		/*
-		 * If this is the first on the page, then it was
-		 * updated with the page itself. Try to discard it
-		 * and if we can't just make it zero.
-		 */
-		if (rb_event_index(event)) {
-			event->time_delta = delta & TS_MASK;
-			event->array[0] = delta >> TS_SHIFT;
-		} else {
-			/* try to discard, since we do not need this */
-			if (!rb_try_to_discard(cpu_buffer, event)) {
-				/* nope, just zero it */
-				event->time_delta = 0;
-				event->array[0] = 0;
-			}
-		}
-		cpu_buffer->write_stamp = ts;
-		/* let the caller know this was the commit */
-		ret = 1;
-	} else {
-		/* Try to discard the event */
-		if (!rb_try_to_discard(cpu_buffer, event)) {
-			/* Darn, this is just wasted space */
-			event->time_delta = 0;
-			event->array[0] = 0;
-		}
-		ret = 0;
-	}
-
-	return ret;
-}
-
 static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	local_inc(&cpu_buffer->committing);
@@ -2111,9 +2116,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		      unsigned long length)
 {
 	struct ring_buffer_event *event;
-	u64 ts, delta = 0;
-	int commit = 0;
+	u64 ts, delta;
 	int nr_loops = 0;
+	int add_timestamp;
 
 	rb_start_commit(cpu_buffer);
 
@@ -2134,6 +2139,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 
 	length = rb_calculate_event_length(length);
  again:
+	add_timestamp = 0;
+	delta = 0;
+
 	/*
 	 * We allow for interrupts to reenter here and do a trace.
 	 * If one does, it will cause this original code to loop
@@ -2172,33 +2180,24 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
-
-			commit = rb_add_time_stamp(cpu_buffer, ts, delta);
-			delta = 0;
-
-			if (commit == -EBUSY)
-				goto out_fail;
-
-			if (commit == -EAGAIN)
-				goto again;
-
-			RB_WARN_ON(cpu_buffer, commit < 0);
+			WARN_ONCE(delta > (1ULL << 59),
+				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+				  (unsigned long long)delta,
+				  (unsigned long long)ts,
+				  (unsigned long long)cpu_buffer->write_stamp);
+			add_timestamp = 1;
 		}
 	}
 
  get_event:
-	event = __rb_reserve_next(cpu_buffer, 0, length, ts);
+	event = __rb_reserve_next(cpu_buffer, length, ts,
+				  delta, add_timestamp);
 	if (unlikely(PTR_ERR(event) == -EAGAIN))
 		goto again;
 
 	if (!event)
 		goto out_fail;
 
-	if (!rb_event_is_commit(cpu_buffer, event))
-		delta = 0;
-
-	event->time_delta = delta;
-
 	return event;
 
  out_fail:
@@ -2311,12 +2310,28 @@ static void
 rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
 {
+	u64 delta;
+
 	/*
 	 * The event first in the commit queue updates the
 	 * time stamp.
 	 */
-	if (rb_event_is_commit(cpu_buffer, event))
-		cpu_buffer->write_stamp += event->time_delta;
+	if (rb_event_is_commit(cpu_buffer, event)) {
+		/*
+		 * A commit event that is first on a page
+		 * updates the write timestamp with the page stamp
+		 */
+		if (!rb_event_index(event))
+			cpu_buffer->write_stamp =
+				cpu_buffer->commit_page->page->time_stamp;
+		else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+			delta = event->array[0];
+			delta <<= TS_SHIFT;
+			delta += event->time_delta;
+			cpu_buffer->write_stamp += delta;
+		} else
+			cpu_buffer->write_stamp += event->time_delta;
+	}
 }
 
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2356,6 +2371,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static inline void rb_event_discard(struct ring_buffer_event *event)
 {
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+		event = skip_time_extend(event);
+
 	/* array[0] holds the actual length for the discarded event */
 	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
 	event->type_len = RINGBUF_TYPE_PADDING;
@@ -3043,12 +3061,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 
  again:
 	/*
-	 * We repeat when a timestamp is encountered. It is possible
-	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written, or from discarded
-	 * commits. The most that we can have is the number on a single page.
+	 * We repeat when a time extend is encountered.
+	 * Since the time extend is always attached to a data event,
+	 * we should never loop more than once.
+	 * (We never hit the following condition more than twice).
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
 		return NULL;
 
 	reader = rb_get_reader_page(cpu_buffer);
@@ -3124,14 +3142,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 		return NULL;
 
 	/*
-	 * We repeat when a timestamp is encountered.
-	 * We can get multiple timestamps by nested interrupts or also
-	 * if filtering is on (discarding commits). Since discarding
-	 * commits can be frequent we can get a lot of timestamps.
-	 * But we limit them by not adding timestamps if they begin
-	 * at the start of a page.
+	 * We repeat when a time extend is encountered.
+	 * Since the time extend is always attached to a data event,
+	 * we should never loop more than once.
+	 * (We never hit the following condition more than twice).
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
 		return NULL;
 
 	if (rb_per_cpu_empty(cpu_buffer))
@@ -3829,7 +3845,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		if (len > (commit - read))
 			len = (commit - read);
 
-		size = rb_event_length(event);
+		/* Always keep the time extend and data together */
+		size = rb_event_ts_length(event);
 
 		if (len < size)
 			goto out_unlock;
@@ -3851,7 +3868,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 				break;
 
 			event = rb_reader_event(cpu_buffer);
-			size = rb_event_length(event);
+			/* Always keep the time extend and data together */
+			size = rb_event_ts_length(event);
 		} while (len > size);
 
 		/* update bpage */
-- 
cgit v1.2.3


From 140ff89127c74b1b1c1b0152a36ea3720ccf6bc3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 Oct 2010 10:50:30 -0400
Subject: ring-buffer: Remove condition to add timestamp in fast path

There's a condition to check if we should add a time extend or
not in the fast path. But this condition is racey (in the sense
that we can add a unnecessary time extend, but nothing that
can break anything). We later check if the time or event time
delta should be zero or have real data in it (not racey), making
this first check redundant.

This check may help save space once in a while, but really is
not worth the hassle to try to save some space that happens at
most 134 ms at a time.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f50f43107e93..d9f3e7a82137 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2119,6 +2119,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	u64 ts, delta;
 	int nr_loops = 0;
 	int add_timestamp;
+	u64 diff;
 
 	rb_start_commit(cpu_buffer);
 
@@ -2155,29 +2156,13 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		goto out_fail;
 
 	ts = rb_time_stamp(cpu_buffer->buffer);
+	diff = ts - cpu_buffer->write_stamp;
 
-	/*
-	 * Only the first commit can update the timestamp.
-	 * Yes there is a race here. If an interrupt comes in
-	 * just after the conditional and it traces too, then it
-	 * will also check the deltas. More than one timestamp may
-	 * also be made. But only the entry that did the actual
-	 * commit will be something other than zero.
-	 */
-	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
-		   rb_page_write(cpu_buffer->tail_page) ==
-		   rb_commit_index(cpu_buffer))) {
-		u64 diff;
-
-		diff = ts - cpu_buffer->write_stamp;
-
-		/* make sure this diff is calculated here */
-		barrier();
-
-		/* Did the write stamp get updated already? */
-		if (unlikely(ts < cpu_buffer->write_stamp))
-			goto get_event;
+	/* make sure this diff is calculated here */
+	barrier();
 
+	/* Did the write stamp get updated already? */
+	if (likely(ts >= cpu_buffer->write_stamp)) {
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
 			WARN_ONCE(delta > (1ULL << 59),
@@ -2189,7 +2174,6 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		}
 	}
 
- get_event:
 	event = __rb_reserve_next(cpu_buffer, length, ts,
 				  delta, add_timestamp);
 	if (unlikely(PTR_ERR(event) == -EAGAIN))
-- 
cgit v1.2.3


From d9abde2138e0a00a0d7e44676928efa0ef629d48 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 19 Oct 2010 13:17:08 -0400
Subject: ring-buffer: Micro-optimize with some strategic inlining

By using inline and noinline, we are able to make the fast path of
recording an event 4% faster.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d9f3e7a82137..f5007d0d932d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2078,7 +2078,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
 	local_inc(&cpu_buffer->commits);
 }
 
-static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	unsigned long commits;
 
@@ -2193,13 +2193,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 
 #define TRACE_RECURSIVE_DEPTH 16
 
-static int trace_recursive_lock(void)
+/* Keep this code out of the fast path cache */
+static noinline void trace_recursive_fail(void)
 {
-	current->trace_recursion++;
-
-	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
-		return 0;
-
 	/* Disable all tracing before we do anything else */
 	tracing_off_permanent();
 
@@ -2211,10 +2207,21 @@ static int trace_recursive_lock(void)
 		    in_nmi());
 
 	WARN_ON_ONCE(1);
+}
+
+static inline int trace_recursive_lock(void)
+{
+	current->trace_recursion++;
+
+	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+		return 0;
+
+	trace_recursive_fail();
+
 	return -1;
 }
 
-static void trace_recursive_unlock(void)
+static inline void trace_recursive_unlock(void)
 {
 	WARN_ON_ONCE(!current->trace_recursion);
 
-- 
cgit v1.2.3


From b8b2663bd7c9da04ac804659b9f617c199d0252c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 19 Oct 2010 13:23:25 -0400
Subject: ring-buffer: Remove unused macro RB_TIMESTAMPS_PER_PAGE

With the binding of time extends to events we no longer need to use
the macro RB_TIMESTAMPS_PER_PAGE. Remove it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f5007d0d932d..ad25490f8b40 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -441,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
 /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
-/* Max number of timestamps that can fit on a page */
-#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
-
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
 	struct buffer_data_page field;
-- 
cgit v1.2.3


From dd49a38cf30944be27892c10b1c0e5b3fa73bcb2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 20 Oct 2010 21:51:26 -0400
Subject: tracing: Do not limit the size of the number of CPU buffers

The tracing per_cpu buffers were limited to 999 CPUs for a mear
savings in stack space of a char array. Up the array to 30 characters
which is more than enough to hold a 64 bit number.

Reported-by: Robin Holt <holt@sgi.com>
Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 001bcd2ccf4a..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
 {
 	struct dentry *d_percpu = tracing_dentry_percpu();
 	struct dentry *d_cpu;
-	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
-	char cpu_dir[7];
+	char cpu_dir[30]; /* 30 characters should be more than enough */
 
-	if (cpu > 999 || cpu < 0)
-		return;
-
-	sprintf(cpu_dir, "cpu%ld", cpu);
+	snprintf(cpu_dir, 30, "cpu%ld", cpu);
 	d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
 	if (!d_cpu) {
 		pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
-- 
cgit v1.2.3


From e3bda3ac33d3bf3e5a4049e2cabe82d3caaffc26 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Mon, 11 Oct 2010 10:20:14 -0500
Subject: kdb,ftdump: Remove reference to internal kdb include

Now that include/linux/kdb.h properly exports all the functions
required to dynamically add a kdb shell command, the reference to the
private kdb header can be removed.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/trace/trace_kdb.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 7b8ecd751d93..3c5c5dfea0b3 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -13,7 +13,6 @@
 #include <linux/kdb.h>
 #include <linux/ftrace.h>
 
-#include "../debug/kdb/kdb_private.h"
 #include "trace.h"
 #include "trace_output.h"
 
-- 
cgit v1.2.3


From aa7b250c252cc8e6b1daf0e1eada5eba42a1a68d Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 21 Oct 2010 22:17:19 -0700
Subject: tracing: Fix 'faild' -> 'failed' typo

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Kosina <trivial@kernel.org>
LKML-Reference: <cd9855af60d7d90e9f55fc7afd0ed23fcdaa6f52.1287724261.git.joe@perches.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 544301d29dee..b8d2852baa4a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -648,7 +648,7 @@ static int register_trace_probe(struct trace_probe *tp)
 	}
 	ret = register_probe_event(tp);
 	if (ret) {
-		pr_warning("Faild to register probe event(%d)\n", ret);
+		pr_warning("Failed to register probe event(%d)\n", ret);
 		goto end;
 	}
 
-- 
cgit v1.2.3


From 61d8e11e519ee7912ab59610fba1aaf08e3c1d84 Mon Sep 17 00:00:00 2001
From: Zimny Lech <napohybelskurwysynom2010@gmail.com>
Date: Wed, 27 Oct 2010 15:34:53 -0700
Subject: Remove duplicate includes from many files

Signed-off-by: Zimny Lech <napohybelskurwysynom2010@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/trace_kprobe.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b8d2852baa4a..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
 #include <linux/perf_event.h>
 #include <linux/stringify.h>
 #include <linux/limits.h>
-#include <linux/uaccess.h>
 #include <asm/bitsperlong.h>
 
 #include "trace.h"
-- 
cgit v1.2.3