From 9fe6b778ca93e6171dbb8e54df557a278a91abea Mon Sep 17 00:00:00 2001 From: Gil Fruchter Date: Tue, 9 Jun 2015 10:32:34 +0300 Subject: tracing: Prefer kcalloc over kzalloc with multiply Use kcalloc for allocating an array instead of kzalloc with multiply, as that is what kcalloc is used for. Found with checkpatch. Link: http://lkml.kernel.org/r/1433835155-6894-2-git-send-email-gilf@ezchip.com Signed-off-by: Gil Fruchter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff8743..5d219384b4d1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(num_possible_cpus(), sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; -- cgit v1.2.3 From 72917235fd5f08638be1d52dcdb0fee3ce2cc95f Mon Sep 17 00:00:00 2001 From: Gil Fruchter Date: Tue, 9 Jun 2015 10:32:35 +0300 Subject: tracing: Fix for non-continuous cpu ids Currently exception occures due to access beyond buffer_iter range while using index of cpu bigger than num_possible_cpus(). Below there is an example for such exception when we use cpus 0,1,16,17. In order to fix buffer allocation size for non-continuous cpu ids we allocate according to the max cpu id and not according to the amount of possible cpus. Example: $ cat /sys/kernel/debug/tracing/per_cpu/cpu1/trace Path: /bin/busybox CPU: 0 PID: 82 Comm: cat Not tainted 4.0.0 #29 task: 80734c80 ti: 80012000 task.ti: 80012000 [ECR ]: 0x00220100 => Invalid Read @ 0x00000000 by insn @ 0x800abafc [EFA ]: 0x00000000 [BLINK ]: ring_buffer_read_finish+0x24/0x64 [ERET ]: rb_check_pages+0x20/0x188 [STAT32]: 0x00001a00 : BTA: 0x800abafc SP: 0x80013f0c FP: 0x57719cf8 LPS: 0x200036b4 LPE: 0x200036b8 LPC: 0x00000000 r00: 0x8002aca0 r01: 0x00001606 r02: 0x00000000 r03: 0x00000001 r04: 0x00000000 r05: 0x804b4954 r06: 0x00030003 r07: 0x8002a260 r08: 0x00000286 r09: 0x00080002 r10: 0x00001006 r11: 0x807351a4 r12: 0x00000001 Stack Trace: rb_check_pages+0x20/0x188 ring_buffer_read_finish+0x24/0x64 tracing_release+0x4e/0x170 __fput+0x62/0x158 task_work_run+0xa2/0xd4 do_notify_resume+0x52/0x7c resume_user_mode_begin+0xdc/0xe0 Link: http://lkml.kernel.org/r/1433835155-6894-3-git-send-email-gilf@ezchip.com Signed-off-by: Noam Camus Signed-off-by: Gil Fruchter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d219384b4d1..59814adc39d6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kcalloc(num_possible_cpus(), sizeof(*iter->buffer_iter), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; -- cgit v1.2.3 From 5e2d5ef8ec1e3854daec41a3697a8d2ce05ff2ef Mon Sep 17 00:00:00 2001 From: Umesh Tiwari Date: Mon, 22 Jun 2015 16:55:06 +0530 Subject: ftrace: correct the counter increment for trace_buffer data In ftrace_dump, for disabling buffer, iter.tr->trace_buffer.data is used. But for enabling, iter.trace_buffer->data is used. 
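A minimal sketch of why the two spellings name the same per-CPU data, under simplified, assumed struct layouts (only the aliasing that trace_init_global_iter() establishes is modeled; this is an illustration, not kernel code):

#include <assert.h>

/* Simplified, hypothetical layouts; only the aliasing matters here. */
struct trace_buffer   { void *data; };
struct trace_array    { struct trace_buffer trace_buffer; };
struct trace_iterator {
	struct trace_array  *tr;
	struct trace_buffer *trace_buffer;
};

/* Mirrors what trace_init_global_iter() does with the global trace array. */
static void init_iter(struct trace_iterator *iter, struct trace_array *tr)
{
	iter->tr = tr;
	iter->trace_buffer = &tr->trace_buffer;
}

int main(void)
{
	static struct trace_array global_trace;
	struct trace_iterator iter;

	init_iter(&iter, &global_trace);
	/* The two spellings used in ftrace_dump() refer to the same object. */
	assert(&iter.tr->trace_buffer.data == &iter.trace_buffer->data);
	return 0;
}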
Even though, both point to same buffer, for readability, same convention should be used. Link: http://lkml.kernel.org/r/1434972306-20043-1-git-send-email-umesh.t@samsung.com Signed-off-by: Umesh Tiwari Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59814adc39d6..6e79408674aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; -- cgit v1.2.3 From fcc742eaad7cbcbbb2a96edc8f1d22adbaa804cb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 17:13:14 -0400 Subject: ring-buffer: Add event descriptor to simplify passing data Add rb_event_info descriptor to pass event info to functions a bit easier than using a bunch of parameters. This will also allow for changing the code around a bit to find better fast paths. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 91 ++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 39 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3..ba8f25ffcf6f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -399,6 +399,17 @@ struct rb_irq_work { bool wakeup_full; }; +/* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + /* * Used for which event context the event is in. * NMI = 0 @@ -2000,9 +2011,12 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) */ static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) + struct ring_buffer_event *event, + struct rb_event_info *info) { + unsigned length = info->length; + u64 delta = info->delta; + /* Only a commit updates the timestamp */ if (unlikely(!rb_event_is_commit(cpu_buffer, event))) delta = 0; @@ -2011,7 +2025,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * If we need to add a timestamp, then we * add it to the start of the resevered space. 
*/ - if (unlikely(add_timestamp)) { + if (unlikely(info->add_timestamp)) { event = rb_add_time_stamp(event, delta); length -= RB_LEN_TIME_EXTEND; delta = 0; @@ -2203,10 +2217,11 @@ static unsigned rb_calculate_event_length(unsigned length) static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2276,13 +2291,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2368,25 +2384,24 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) + struct rb_event_info *info) { - struct buffer_page *tail_page; struct ring_buffer_event *event; + struct buffer_page *tail_page; unsigned long tail, write; /* @@ -2394,33 +2409,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * hold in the time field of the event, then we append a * TIME EXTEND event ahead of the data event. */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; - tail = write - length; + tail = write - info->length; /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ if (!tail) - delta = 0; + info->delta = 0; /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + return rb_move_tail(cpu_buffer, tail, info); /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); @@ -2429,10 +2443,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * its timestamp. 
*/ if (!tail) - tail_page->page->time_stamp = ts; + tail_page->page->time_stamp = info->ts; /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + local_add(info->length, &cpu_buffer->entries_bytes); return event; } @@ -2521,9 +2535,8 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + struct rb_event_info info; int nr_loops = 0; - int add_timestamp; u64 diff; rb_start_commit(cpu_buffer); @@ -2543,10 +2556,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, } #endif - length = rb_calculate_event_length(length); + info.length = rb_calculate_event_length(length); again: - add_timestamp = 0; - delta = 0; + info.add_timestamp = 0; + info.delta = 0; /* * We allow for interrupts to reenter here and do a trace. @@ -2560,35 +2573,35 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK local_clock_stable = sched_clock_stable(); #endif - WARN_ONCE(delta > (1ULL << 59), + WARN_ONCE(info.delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, + (unsigned long long)info.delta, + (unsigned long long)info.ts, (unsigned long long)cpu_buffer->write_stamp, local_clock_stable ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; + info.add_timestamp = 1; } } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); + event = __rb_reserve_next(cpu_buffer, &info); + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; -- cgit v1.2.3 From 9826b2733a4399149072058a11f611357479229d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 17:36:45 -0400 Subject: ring-buffer: Move the adding of the extended timestamp out of line Requiring a extended time stamp is an uncommon occurrence, and it is best to do it out of line when needed. Add a noinline function that handles the extended timestamp and have it called with an unlikely to completely move it out of the fast path. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ba8f25ffcf6f..a78d4ee4bc58 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2396,6 +2396,29 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! 
%llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} + static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -2582,22 +2605,8 @@ rb_reserve_next_event(struct ring_buffer *buffer, /* Did the write stamp get updated already? */ if (likely(info.ts >= cpu_buffer->write_stamp)) { info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(info.delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info.delta, - (unsigned long long)info.ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - info.add_timestamp = 1; - } + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); } event = __rb_reserve_next(cpu_buffer, &info); -- cgit v1.2.3 From a4543a2fa9ef31d6d0f854a4e14f8f82e7996d8d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 09:40:18 -0400 Subject: ring-buffer: Get timestamp after event is allocated Move the capturing of the timestamp to after an event is allocated. If the event is not a commit (where it is an event that preempted another event), then no timestamp is needed, because the delta of nested events is always zero. If the event starts on a new page, no delta needs to be calculated as the full timestamp will be added to the page header, and the event will have a delta of zero. Now if the event requires a time extend (the delta does not fit in the 27 bit delta slot in the header), then the event is discarded, the length is extended to hold the TIME_EXTEND event that allows for a 59 bit delta, and the commit is tried again. If the event can't be discarded (another event came in after it), then the TIME_EXTEND is added directly to the allocated event and the rest of the event is given padding. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 161 ++++++++++++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 47 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a78d4ee4bc58..b5ed553e0a45 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2009,7 +2009,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) * and with this, we can determine what to place into the * data field. */ -static void +static void __always_inline rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) @@ -2017,10 +2017,6 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - /* * If we need to add a timestamp, then we * add it to the start of the resevered space. 
@@ -2286,6 +2282,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2300,6 +2298,16 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, int ret; u64 ts; + /* + * If the event had a timestamp attached to it, remove it. + * The first event on a page (nested or not) always uses + * the full timestamp of the new page. + */ + if (info->add_timestamp) { + info->add_timestamp = 0; + info->length -= RB_LEN_TIME_EXTEND; + } + next_page = tail_page; rb_inc_page(cpu_buffer, &next_page); @@ -2386,6 +2394,11 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now to update timestamps */ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2403,10 +2416,23 @@ static inline bool sched_clock_stable(void) } #endif +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); +static inline void rb_event_discard(struct ring_buffer_event *event); +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + static noinline void rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, struct rb_event_info *info) { + struct ring_buffer_event *padding; + int length; + int size; + WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, @@ -2416,7 +2442,61 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - info->add_timestamp = 1; + + /* + * Discarding this event to add a timestamp in front, but + * we still need to update the length of it to perform the discard. + */ + rb_update_event(cpu_buffer, event, info); + + if (rb_try_to_discard(cpu_buffer, event)) { + info->add_timestamp = 1; + /* + * The time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + info->length += RB_LEN_TIME_EXTEND; + return; + } + + /* + * Humpf! An event came in after this one, and because it is not a + * commit, it will have a delta of zero, thus, it will take on + * the timestamp of the previous commit, which happened a long time + * ago (we need to add a timestamp, remember?). + * We need to add the timestamp here. A timestamp is a fixed size + * of 8 bytes. That means the rest of the event needs to be + * padding. + */ + size = info->length - RB_LEN_TIME_EXTEND; + + /* The padding will have a delta of 1 */ + if (size) + info->delta--; + + padding = rb_add_time_stamp(event, info->delta); + + if (size) { + length = info->length; + info->delta = 0; + info->length = size; + rb_update_event(cpu_buffer, padding, info); + + rb_event_discard(padding); + + /* Still visible, need to update write_stamp */ + rb_update_write_stamp(cpu_buffer, event); + + /* Still need to commit the padding. 
*/ + rb_end_commit(cpu_buffer); + + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + + /* The next iteration still uses the original length */ + info->length = length; + } } static struct ring_buffer_event * @@ -2426,14 +2506,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write; - - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(info->add_timestamp)) - info->length += RB_LEN_TIME_EXTEND; + bool is_commit; tail_page = info->tail_page = cpu_buffer->tail_page; write = local_add_return(info->length, &tail_page->write); @@ -2442,31 +2515,42 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - info->length; - /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. - */ - if (!tail) - info->delta = 0; - /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, tail, info); /* We reserved something on the buffer */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, info); - - local_inc(&tail_page->entries); /* - * If this is the first commit on the page, then update - * its timestamp. + * If this is the first commit on the page, then it has the same + * timestamp as the page itself, otherwise we need to figure out + * the delta. */ - if (!tail) + info->ts = rb_time_stamp(cpu_buffer->buffer); + is_commit = rb_event_is_commit(cpu_buffer, event); + + /* Commits are special (non nested events) */ + info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0; + + if (!tail) { + /* + * If this is the first commit on the page, set the + * page to its timestamp. + */ tail_page->page->time_stamp = info->ts; + info->delta = 0; + + } else if (unlikely(test_time_stamp(info->delta)) && + !info->add_timestamp) { + rb_handle_timestamp(cpu_buffer, event, info); + return ERR_PTR(-EAGAIN); + } + + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); @@ -2560,7 +2644,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; - u64 diff; rb_start_commit(cpu_buffer); @@ -2578,12 +2661,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } #endif - info.length = rb_calculate_event_length(length); - again: info.add_timestamp = 0; - info.delta = 0; - + again: /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2596,19 +2676,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - info.ts = rb_time_stamp(cpu_buffer->buffer); - diff = info.ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? 
*/ - if (likely(info.ts >= cpu_buffer->write_stamp)) { - info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) - rb_handle_timestamp(cpu_buffer, &info); - } - event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) -- cgit v1.2.3 From 7d75e6833b579adb3de2c7b917de1204eeafea47 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 10:29:10 -0400 Subject: ring-buffer: Make sure event has enough room for extend and padding Now that events only add time extends after it is committed, in case an event comes in before it can discard the allocated event, the time extend needs to be stored within the event. If the event is bigger than then size needed for the time extend, padding must be added. The minimum padding size is 8 bytes. Thus if the event is 12 bytes (size of time extend + 4), there will not be enough room to add both the time extend and padding. Make sure all events are either 8 bytes or 16 or more bytes. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b5ed553e0a45..781ce359976c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2208,6 +2208,21 @@ static unsigned rb_calculate_event_length(unsigned length) length += RB_EVNT_HDR_SIZE; length = ALIGN(length, RB_ARCH_ALIGNMENT); + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). + */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + return length; } -- cgit v1.2.3 From d90fd77402d3de56a9ca3df04e5d868d0979dc59 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 12:12:27 -0400 Subject: ring-buffer: Reorganize function locations Functions in ring-buffer.c have gotten interleaved between different use cases. Move the functions around to get like functions closer together. This may or may not help gcc keep cache locality, but it makes it a little easier to work with the code. 
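To make the sizing rule in rb_calculate_event_length() above concrete, here is a standalone sketch of the rounding it describes (the constants are assumptions picked to mirror a typical 4-byte-aligned configuration, and the RB_FORCE_8BYTE_ALIGNMENT case is omitted; this is an illustration, not the kernel function):

#include <stdio.h>

#define RB_ALIGNMENT		4U
#define RB_EVNT_HDR_SIZE	4U
#define RB_LEN_TIME_EXTEND	8U
#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * 28)	/* assumed inline-length limit */

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/*
 * Reserved size for a data payload: header plus aligned payload, except
 * that a 12-byte result is bumped to 16 so the event could later hold an
 * 8-byte TIME_EXTEND plus the 8-byte minimum padding.
 */
static unsigned int event_length(unsigned int length)
{
	if (!length)
		length = 1;			/* zero length causes confusion */
	if (length > RB_MAX_SMALL_DATA)
		length += sizeof(unsigned int);	/* explicit length word */
	length += RB_EVNT_HDR_SIZE;
	length = ALIGN_UP(length, RB_ALIGNMENT);
	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)	/* 12 -> 16 */
		length += RB_ALIGNMENT;
	return length;
}

int main(void)
{
	for (unsigned int data = 1; data <= 12; data++)
		printf("payload %2u bytes -> reserve %2u bytes\n",
		       data, event_length(data));
	return 0;
}

With these assumed constants a 5 to 8 byte payload, which would otherwise round to a 12-byte event, is reserved as 16 bytes instead, so small events come out as either 8 bytes or at least 16.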
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 920 ++++++++++++++++++++++----------------------- 1 file changed, 456 insertions(+), 464 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 781ce359976c..1cce0fbf92ce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; @@ -1979,63 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. 
- */ -static void __always_inline -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, - struct rb_event_info *info) -{ - unsigned length = info->length; - u64 delta = info->delta; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(info->add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2194,38 +2070,6 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length++; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - /* - * In case the time delta is larger than the 27 bits for it - * in the header, we need to add a timestamp. If another - * event comes in when trying to discard this one to increase - * the length, then the timestamp will be added in the allocated - * space of this event. If length is bigger than the size needed - * for the TIME_EXTEND, then padding has to be used. The events - * length must be either RB_LEN_TIME_EXTEND, or greater than or equal - * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. - * As length is a multiple of 4, we only need to worry if it - * is 12 (RB_LEN_TIME_EXTEND + 4). - */ - if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) - length += RB_ALIGNMENT; - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) @@ -2424,71 +2268,471 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) { - return true; -} -#endif + event->type_len = RINGBUF_TYPE_TIME_EXTEND; -static inline int -rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); -static inline void rb_event_discard(struct ring_buffer_event *event); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } -static noinline void -rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, - struct rb_event_info *info) -{ - struct ring_buffer_event *padding; - int length; - int size; + return skip_time_extend(event); +} - WARN_ONCE(info->delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info->delta, - (unsigned long long)info->ts, - (unsigned long long)cpu_buffer->write_stamp, - sched_clock_stable() ? 
"" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void __always_inline +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; /* - * Discarding this event to add a timestamp in front, but - * we still need to update the length of it to perform the discard. + * If we need to add a timestamp, then we + * add it to the start of the resevered space. */ - rb_update_event(cpu_buffer, event, info); - - if (rb_try_to_discard(cpu_buffer, event)) { - info->add_timestamp = 1; - /* - * The time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - info->length += RB_LEN_TIME_EXTEND; - return; + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; } - /* - * Humpf! An event came in after this one, and because it is not a - * commit, it will have a delta of zero, thus, it will take on - * the timestamp of the previous commit, which happened a long time - * ago (we need to add a timestamp, remember?). - * We need to add the timestamp here. A timestamp is a fixed size - * of 8 bytes. That means the rest of the event needs to be - * padding. - */ - size = info->length - RB_LEN_TIME_EXTEND; - - /* The padding will have a delta of 1 */ - if (size) - info->delta--; + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} + +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length++; + + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); + + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). 
+ */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + + return length; +} + +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_ts_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + old_index += write_mask; + new_index += write_mask; + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); + return 1; + } + } + + /* could not discard */ + return 0; +} + +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. 
+ */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. + */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + /* + * The event first in the commit queue updates the + * time stamp. + */ + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} + +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; + + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + +/* + * The lock and unlock are done within a preempt disable section. 
+ * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ + +static __always_inline int +trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned int val = cpu_buffer->current_context; + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = RB_CTX_NMI; + else if (in_irq()) + bit = RB_CTX_IRQ; + else + bit = RB_CTX_SOFTIRQ; + } else + bit = RB_CTX_NORMAL; + + if (unlikely(val & (1 << bit))) + return 1; + + val |= (1 << bit); + cpu_buffer->current_context = val; + + return 0; +} + +static __always_inline void +trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->current_context &= cpu_buffer->current_context - 1; +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + struct ring_buffer_event *padding; + int length; + int size; + + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + + /* + * Discarding this event to add a timestamp in front, but + * we still need to update the length of it to perform the discard. 
+ */ + rb_update_event(cpu_buffer, event, info); + + if (rb_try_to_discard(cpu_buffer, event)) { + info->add_timestamp = 1; + /* + * The time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + info->length += RB_LEN_TIME_EXTEND; + return; + } + + /* + * Humpf! An event came in after this one, and because it is not a + * commit, it will have a delta of zero, thus, it will take on + * the timestamp of the previous commit, which happened a long time + * ago (we need to add a timestamp, remember?). + * We need to add the timestamp here. A timestamp is a fixed size + * of 8 bytes. That means the rest of the event needs to be + * padding. + */ + size = info->length - RB_LEN_TIME_EXTEND; + + /* The padding will have a delta of 1 */ + if (size) + info->delta--; padding = rb_add_time_stamp(event, info->delta); @@ -2573,84 +2817,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static inline int -rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; - - new_index = rb_event_index(event); - old_index = new_index + rb_event_ts_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - unsigned long write_mask = - local_read(&bpage->write) & ~RB_WRITE_MASK; - unsigned long event_length = rb_event_length(event); - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. - */ - old_index += write_mask; - new_index += write_mask; - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) { - /* update counters */ - local_sub(event_length, &cpu_buffer->entries_bytes); - return 1; - } - } - - /* could not discard */ - return 0; -} - -static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - local_inc(&cpu_buffer->committing); - local_inc(&cpu_buffer->commits); -} - -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long commits; - - if (RB_WARN_ON(cpu_buffer, - !local_read(&cpu_buffer->committing))) - return; - - again: - commits = local_read(&cpu_buffer->commits); - /* synchronize with interrupts */ - barrier(); - if (local_read(&cpu_buffer->committing) == 1) - rb_set_commit_to_write(cpu_buffer); - - local_dec(&cpu_buffer->committing); - - /* synchronize with interrupts */ - barrier(); - - /* - * Need to account for interrupts coming in between the - * updating of the commit page and the clearing of the - * committing counter. - */ - if (unlikely(local_read(&cpu_buffer->commits) != commits) && - !local_read(&cpu_buffer->committing)) { - local_inc(&cpu_buffer->committing); - goto again; - } -} - static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, @@ -2706,75 +2872,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } -/* - * The lock and unlock are done within a preempt disable section. - * The current_context per_cpu variable can only be modified - * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. 
To pass this - * information from the lock to the unlock without having to - * access the 'in_interrupt()' functions again (which do show - * a bit of overhead in something as critical as function tracing, - * we use a bitmask trick. - * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. - * - * This works because this is the order of contexts that can - * preempt other contexts. A SoftIRQ never preempts an IRQ - * context. - * - * When the context is determined, the corresponding bit is - * checked and set (if it was set, then a recursion of that context - * happened). - * - * On unlock, we need to clear this bit. To do so, just subtract - * 1 from the current_context and AND it to itself. - * - * (binary) - * 101 - 1 = 100 - * 101 & 100 = 100 (clearing bit zero) - * - * 1010 - 1 = 1001 - * 1010 & 1001 = 1000 (clearing bit 1) - * - * The least significant bit can be cleared this way, and it - * just so happens that it is the same bit corresponding to - * the current context. - */ - -static __always_inline int -trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned int val = cpu_buffer->current_context; - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = RB_CTX_NMI; - else if (in_irq()) - bit = RB_CTX_IRQ; - else - bit = RB_CTX_SOFTIRQ; - } else - bit = RB_CTX_NORMAL; - - if (unlikely(val & (1 << bit))) - return 1; - - val |= (1 << bit); - cpu_buffer->current_context = val; - - return 0; -} - -static __always_inline void -trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->current_context &= cpu_buffer->current_context - 1; -} - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2833,111 +2930,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. 
- */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) -{ - bool pagebusy; - - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); - } - - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } - - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. - */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - rb_wakeups(buffer, cpu_buffer); - - trace_recursive_unlock(cpu_buffer); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer -- cgit v1.2.3 From 72ac426a5bb0cec572d26b4456f8c1e14601694e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 16 Jul 2015 13:24:54 -0400 Subject: tracing: Clean up stack tracing and fix fentry updates Akashi Takahiro was porting the stack tracer to arm64 and found some issues with it. One was that it repeats the top function, due to the stack frame added by the mcount caller and added by itself. This was added when fentry came in, and before fentry created its own stack frame. But x86's fentry now creates its own stack frame, and there's no need to insert the function again. 
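A toy userspace sketch of the resulting behaviour, with invented addresses (the kernel walks stack_dump_trace[] against the traced ip; only the skip step is modeled here): rather than inserting ip as an extra first entry, the tracer skips the leading entries that belong to its own call chain until ip is found, so the traced function appears exactly once.

#include <stdio.h>

/*
 * Toy model: the unwinder reports the stack tracer's own helpers before
 * the traced function. Skipping entries until the traced ip is found
 * replaces the old trick of prepending ip, which duplicated the top
 * function. All addresses are made up for the example.
 */
int main(void)
{
	unsigned long traced_ip = 0xc0de;
	unsigned long captured[] = {
		0xbeef,		/* stack-saving helper            */
		0xfeed,		/* the tracer's check routine     */
		0xc0de,		/* the traced function itself     */
		0xabcd,		/* its caller                     */
		0x1234,		/* and so on up the stack         */
	};
	size_t n = sizeof(captured) / sizeof(captured[0]);
	size_t start = 0;

	while (start < n && captured[start] != traced_ip)
		start++;			/* drop tracer-internal frames */

	for (size_t i = start; i < n; i++)
		printf("%zu: %#lx\n", i - start, captured[i]);
	return 0;
}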
This also cleans up the code a bit, where it doesn't need to do something special for fentry, and doesn't include insertion of a duplicate entry for the called function being traced. Link: http://lkml.kernel.org/r/55A646EE.6030402@linaro.org Some-suggestions-by: Jungseok Lee Some-suggestions-by: Mark Rutland Reported-by: AKASHI Takahiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 68 ++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 45 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e9..b746399ab59c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,12 +18,6 @@ #define STACK_TRACE_ENTRIES 500 -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; @@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; */ static struct stack_trace max_stack_trace = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; static unsigned long max_stack_size; @@ -55,7 +49,7 @@ static inline void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); for (i = 0; i < max_stack_trace.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) @@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack) unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; @@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; max_stack_trace.nr_entries = 0; - - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); - /* - * Add the passed in ip from the function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. - */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack) while (i < max_stack_trace.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_dump_index[x] = this_size; p = start; for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_dump_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. 
*/ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); max_stack_size -= tracer_frame; @@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + max_stack_trace.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); @@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. - * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. - */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); -- cgit v1.2.3 From 8e436ca042d904533a1e14fdc85f0facdfca752f Mon Sep 17 00:00:00 2001 From: Umesh Tiwari Date: Mon, 22 Jun 2015 16:58:08 +0530 Subject: ftrace: add tracing_thresh to function profile This patch extends the tracing_thresh functionality to the function profile tracer. If tracing_thresh is set, print only those entries whose average is greater than tracing_thresh. Link: http://lkml.kernel.org/r/1434972488-8571-1-git-send-email-umesh.t@samsung.com Signed-off-by: Umesh Tiwari [ Removed unnecessary 'moved' comment ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02bece4a99ea..f46dbb5cdf76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -613,13 +613,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) -- cgit v1.2.3 From 82c355e81afbf16bc1ab379899a79eb66e2b7504 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 16 Jul 2015 21:58:52 -0400 Subject: ftrace: Fix function_graph duration spacing with 7-digits Jungseok Lee noticed the following: Currently, the row width of 7-digit duration numbers is not aligned with the other cases, as in the following example.
 3) $ 3999884 us | } 3) | finish_task_switch() { 3) 0.365 us | _raw_spin_unlock_irq(); 3) 3.333 us | } 3) $ 3999976 us | } 3) $ 3999979 us | } /* schedule */ By adding a single white space in the 7-digit case, the format can easily be unified as follows. 3) $ 2237472 us | } 3) | finish_task_switch() { 3) 0.364 us | _raw_spin_unlock_irq(); 3) 3.125 us | } 3) $ 2237556 us | } 3) $ 2237559 us | } /* schedule */ Instead of making a special case for 7-digit numbers, the logic of the len and the space loop is slightly modified to make the two cases have the same format. Link: http://lkml.kernel.org/r/1436626300-1679-2-git-send-email-jungseoklee85@gmail.com Reported-by: Jungseok Lee Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c12..ca98445782ac 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } -- cgit v1.2.3 From b838e1d96c613019095ba008afbee800977b0582 Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Sat, 11 Jul 2015 14:51:40 +0000 Subject: tracing: Introduce two additional marks for delay Fine-granularity support for delay would be very useful when profiling VM logic, such as page allocation (including page reclaim) and memory compaction, with the function graph tracer. Thus, this patch adds two additional marks with two changes. - An equal sign in the mark selection function is removed to align the code behavior with the comments and documentation. - The function graph example related to delay in ftrace.txt is updated to cover all supported marks.
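As a rough illustration of the resulting behaviour (a sketch only, not part of the patch), the mark selection can be modelled as a small standalone C program. The table mirrors the entries visible in the trace_output.c hunk below, and the strict '>' comparison reflects the removed equal sign; treat it as a model of the selection logic, not the exact kernel table.

        #include <stdio.h>

        /* Illustrative userspace model of the mark table after this patch;
         * values are in nanoseconds, as in trace_output.c. */
        static const struct {
                unsigned long long val;
                char sym;
        } mark[] = {
                { 1000000000ULL, '$' },        /* 1 sec */
                {  100000000ULL, '@' },        /* 100 msec */
                {   10000000ULL, '*' },        /* 10 msec */
                {    1000000ULL, '#' },        /* 1000 usecs */
                {     100000ULL, '!' },        /* 100 usecs */
                {      10000ULL, '+' },        /* 10 usecs */
        };

        static char find_mark(unsigned long long d)
        {
                size_t i;

                /* strict '>': a delay exactly equal to a threshold falls
                 * through to the next (smaller) mark */
                for (i = 0; i < sizeof(mark) / sizeof(mark[0]); i++)
                        if (d > mark[i].val)
                                break;
                return i < sizeof(mark) / sizeof(mark[0]) ? mark[i].sym : ' ';
        }

        int main(void)
        {
                printf("150 ms  -> '%c'\n", find_mark(150000000ULL)); /* '@' */
                printf("100 ms  -> '%c'\n", find_mark(100000000ULL)); /* '*', due to strict '>' */
                printf("42 usec -> '%c'\n", find_mark(42000ULL));     /* '+' */
                return 0;
        }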
Link: http://lkml.kernel.org/r/1436626300-1679-3-git-send-email-jungseoklee85@gmail.com Cc: Byungchul Park Signed-off-by: Jungseok Lee Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc..8e481a84aeea 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -496,6 +496,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } -- cgit v1.2.3 From 4246a0b63bd8f56a1469b12eafeb875b1041a451 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Jul 2015 15:29:37 +0200 Subject: block: add a bi_error field to struct bio Currently we have two different ways to signal an I/O error on a BIO: (1) by clearing the BIO_UPTODATE flag (2) by returning a Linux errno value to the bi_end_io callback The first one has the drawback of only communicating a single possible error (-EIO), and the second one has the drawback of not being persistent when bios are queued up, and of not being passed along from child to parent bio in the ever more popular chaining scenario. Having both mechanisms available has the additional drawback of utterly confusing driver authors and introducing bugs where various I/O submitters only deal with one of them, and the others have to add boilerplate code to deal with both kinds of error returns. So add a new bi_error field to store an errno value directly in struct bio and remove the existing mechanisms to clean all this up.
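To make the new convention concrete, here is a minimal sketch (not taken from the patch) of a driver-side completion path once bi_error is the single error channel. It assumes the post-series bio_endio() that takes only the bio and picks the error up from bio->bi_error, which is the same convention the blktrace hunks below switch to when they read bio->bi_error directly.

        /* Hedged sketch of a driver completing a bio under the new scheme;
         * assumes bio_endio() no longer takes an error argument. */
        #include <linux/bio.h>

        static void my_driver_end_io(struct bio *bio, int err)
        {
                bio->bi_error = err;    /* 0 on success, negative errno on failure */
                bio_endio(bio);
        }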
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: NeilBrown Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b3e6b39b6cf9..90e72a0c3047 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - if (!error && !bio_flagged(bio, BIO_UPTODATE)) - error = EIO; - __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, what, error, 0, NULL); } @@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); + bio->bi_error, sizeof(rpdu), &rpdu); } } @@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + sizeof(r), &r); } /** -- cgit v1.2.3 From fbd705a0c6184580d0e2fbcbd47a37b6e5822511 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jun 2015 11:13:36 +0200 Subject: sched: Introduce the 'trace_sched_waking' tracepoint Mathieu reported that since 317f394160e9 ("sched: Move the second half of ttwu() to the remote cpu") trace_sched_wakeup() can happen out of context of the waker. This is a problem when you want to analyse wakeup paths because it is now very hard to correlate the wakeup event to whoever issued the wakeup. OTOH trace_sched_wakeup() is issued at the point where we set p->state = TASK_RUNNING, which is right where we hand the task off to the scheduler, so this is an important point when looking at scheduling behaviour: up to here it has been the wakeup path, and everything hereafter is due to scheduler policy. To bridge this gap, introduce a second tracepoint: trace_sched_waking. It is guaranteed to be called in the waker context.
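For consumers, hooking the new tracepoint might look roughly like the sketch below (illustrative only; it assumes the register_trace_sched_waking() stub generated for the tracepoint and the same probe prototype, without the old 'success' argument, that the updated sched_wakeup probes in the hunks below use).

        #include <linux/module.h>
        #include <linux/sched.h>
        #include <linux/tracepoint.h>
        #include <trace/events/sched.h>

        /* Runs in the waker's context, so 'current' is the task issuing the
         * wakeup and 'p' is the task being woken. */
        static void my_waking_probe(void *ignore, struct task_struct *p)
        {
                trace_printk("%s/%d waking %s/%d\n",
                             current->comm, current->pid, p->comm, p->pid);
        }

        static int __init waking_example_init(void)
        {
                return register_trace_sched_waking(my_waking_probe, NULL);
        }

        static void __exit waking_example_exit(void)
        {
                unregister_trace_sched_waking(my_waking_probe, NULL);
                tracepoint_synchronize_unregister();
        }

        module_init(waking_example_init);
        module_exit(waking_example_exit);
        MODULE_LICENSE("GPL");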
Reported-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Francis Giraldeau Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150609091336.GQ3644@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 419ca37e72c9..f270088e9929 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n } static void -probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9b33dd117f3f..12cbe77b4136 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(void *ignore, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p) { struct trace_array_cpu *data; int cpu = smp_processor_id(); -- cgit v1.2.3 From 04a22fae4cbc1f7d3f7471e9b36359f98bd3f043 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:50 +0000 Subject: tracing, perf: Implement BPF programs attached to uprobes By copying the BPF-related operations to the uprobe processing path, this patch allows users to attach BPF programs to uprobes just as they already do with kprobes. After this patch, users are allowed to use PERF_EVENT_IOC_SET_BPF on a uprobe perf event. This makes it possible to profile user space programs and kernel events together using BPF. Because of this patch, CONFIG_BPF_EVENTS should be selected by CONFIG_UPROBE_EVENT to ensure trace_call_bpf() is compiled even if KPROBE_EVENT is not set.
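From user space, the attach step this enables can be sketched as follows (illustrative only; perf_fd is assumed to come from perf_event_open() on a uprobe-backed event and prog_fd from a prior bpf(BPF_PROG_LOAD, ...) call).

        #include <sys/ioctl.h>
        #include <linux/perf_event.h>

        /* Attach an already-loaded BPF program to a uprobe-backed perf event,
         * then enable the event; returns 0 on success, -1 on failure. */
        static int attach_bpf_to_uprobe_event(int perf_fd, int prog_fd)
        {
                if (ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0)
                        return -1;
                return ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);
        }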
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/Kconfig | 2 +- kernel/trace/trace_uprobe.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48ae153a..1153c43428f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on KPROBE_EVENT || UPROBE_EVENT bool default y help diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b36fa8..f97479f1ce35 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1095,11 +1095,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1289,6 +1293,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); -- cgit v1.2.3 From 35578d7984003097af2b1e34502bc943d40c1804 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 6 Aug 2015 07:02:35 +0000 Subject: bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter According to the perf_event_map_fd and index, the function bpf_perf_event_read() can convert the corresponding map value to the pointer to struct perf_event and return the Hardware PMU counter value. Signed-off-by: Kaixu Xia Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041adee90..ef9936df1b04 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (!event) + return -ENOENT; + + /* + * we don't know if the function is run successfully by the + * return value. It can be judged in other places, such as + * eBPF programs. 
+ */ + return perf_event_read_local(event); +} + +const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -183,6 +212,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; default: return NULL; } -- cgit v1.2.3 From 9f61668073a8d80650622e792aff876db9ca23c6 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 10 Aug 2015 14:35:46 +0200 Subject: tracing: Allow triggers to filter for CPU ids and process names By extending the filter rules by more generic fields we can write triggers filters like echo 'stacktrace if cpu == 1' > \ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger or echo 'stacktrace if comm == sshd' > \ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger CPU and COMM are not part of struct trace_entry. We could add the two new fields to ftrace_common_field list and fix up all depending sides. But that looks pretty ugly. Another thing I would like to avoid that the 'format' file contents changes. All this can be avoided by introducing another list which contains non field members of struct trace_entry. Link: http://lkml.kernel.org/r/1439210146-24707-1-git-send-email-daniel.wagner@bmw-carit.de Signed-off-by: Daniel Wagner Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 25 ++++++++++++++++++ kernel/trace/trace_events_filter.c | 54 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 404a372ad85a..7ca09cdc20c2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -30,6 +30,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; + field = __find_event_field(&ftrace_generic_fields, name); + if (field) + return field; + field = __find_event_field(&ftrace_common_fields, name); if (field) return field; @@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, } EXPORT_SYMBOL_GPL(trace_define_field); +#define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ + filter_type); \ + if (ret) \ + return ret; + #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ @@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; +static int trace_define_generic_fields(void) +{ + int ret; + + __generic_field(int, cpu, FILTER_OTHER); + __generic_field(char *, comm, FILTER_PTR_STRING); + + return ret; +} + static int trace_define_common_fields(void) { int ret; @@ -2671,6 +2693,9 @@ static __init int event_trace_init(void) if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); + if (trace_define_generic_fields()) + pr_warn("tracing: 
Failed to allocated generic fields"); + if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d81d6f302b14..bd1bf184c5c9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* Filter predicate for CPUs. */ +static int filter_pred_cpu(struct filter_pred *pred, void *event) +{ + int cpu, cmp; + int match = 0; + + cpu = raw_smp_processor_id(); + cmp = pred->val; + + switch (pred->op) { + case OP_EQ: + match = cpu == cmp; + break; + case OP_LT: + match = cpu < cmp; + break; + case OP_LE: + match = cpu <= cmp; + break; + case OP_GT: + match = cpu > cmp; + break; + case OP_GE: + match = cpu >= cmp; + break; + default: + break; + } + + return !!match == !pred->not; +} + +/* Filter predicate for COMM. */ +static int filter_pred_comm(struct filter_pred *pred, void *event) +{ + int cmp, match; + + cmp = pred->regex.match(current->comm, &pred->regex, + pred->regex.field_len); + match = cmp ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; @@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps, if (is_string_field(field)) { filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) { + if (!strcmp(field->name, "comm")) { + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - fn = select_comparison_fn(pred->op, field->size, + if (!strcmp(field->name, "cpu")) + fn = filter_pred_cpu; + else + fn = select_comparison_fn(pred->op, field->size, field->is_signed); if (!fn) { parse_error(ps, FILT_ERR_INVALID_OP, 0); -- cgit v1.2.3 From a2fb3382edbea83c6f2bf6ac15e3673b2e254aad Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:46 +0000 Subject: tracing/uprobes: Do not print '0x (null)' when offset is 0 When manually added uprobe point with zero address, 'uprobe_events' output '(null)' instead of 0x00000000: # echo p:probe_libc/abs_0 /path/to/lib.bin:0x0 arg1=%ax > \ /sys/kernel/debug/tracing/uprobe_events # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x (null) arg1=%ax This patch fixes this behavior: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x0000000000000000 Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_uprobe.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f97479f1ce35..d2f6d0be3503 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + seq_printf(m, " %s:", tu->filename); + + /* Don't 
print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); -- cgit v1.2.3 From 1a6877b9c0c2ad901d4335d909432d3bb6d3a330 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Aug 2015 15:56:22 -0700 Subject: lib: introduce strncpy_from_unsafe() generalize FETCH_FUNC_NAME(memory, string) into strncpy_from_unsafe() and fix sparse warnings that were present in original implementation. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/trace_kprobe.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7d0cdd9906c..c9956440d0e6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory) static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr, void *dest) { - long ret; int maxlen = get_rloc_len(*(u32 *)dest); u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); + long ret; if (!maxlen) return; @@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, * Try to get string again, since the string can be changed while * probing. */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); + ret = strncpy_from_unsafe(dst, addr, maxlen); if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; + dst[0] = '\0'; *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); } } NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); -- cgit v1.2.3 From 8d3b7dce8622919da5c5822ef7338d6604c9fe6e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Aug 2015 15:56:23 -0700 Subject: bpf: add support for %s specifier to bpf_trace_printk() %s specifier makes bpf program and kernel debugging easier. To make sure that trace_printk won't crash the unsafe string is copied into stack and unsafe pointer is substituted. The following C program: #include int foo(struct pt_regs *ctx, struct filename *filename) { void *name = 0; bpf_probe_read(&name, sizeof(name), &filename->name); bpf_trace_printk("executed %s\n", name); return 0; } when attached to kprobe do_execve() will produce output in /sys/kernel/debug/tracing/trace_pipe : make-13492 [002] d..1 3250.997277: : executed /bin/sh sh-13493 [004] d..1 3250.998716: : executed /usr/bin/gcc gcc-13494 [002] d..1 3250.999822: : executed /usr/lib/gcc/x86_64-linux-gnu/4.7/cc1 gcc-13495 [002] d..1 3251.006731: : executed /usr/bin/as gcc-13496 [002] d..1 3251.011831: : executed /usr/lib/gcc/x86_64-linux-gnu/4.7/collect2 collect2-13497 [000] d..1 3251.012941: : executed /usr/bin/ld Suggested-by: Brendan Gregg Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/trace/bpf_trace.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ef9936df1b04..0fe96c7c8803 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = { /* * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) { char *fmt = (char *) (long) r1; + bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; + u64 unsafe_addr; + char buf[64]; int i; /* @@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) if (fmt[i] == 'l') { mod[fmt_cnt]++; i++; - } else if (fmt[i] == 'p') { + } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; i++; if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) return -EINVAL; fmt_cnt++; + if (fmt[i - 1] == 's') { + if (str_seen) + /* allow only one '%s' per fmt string */ + return -EINVAL; + str_seen = true; + + switch (fmt_cnt) { + case 1: + unsafe_addr = r3; + r3 = (long) buf; + break; + case 2: + unsafe_addr = r4; + r4 = (long) buf; + break; + case 3: + unsafe_addr = r5; + r5 = (long) buf; + break; + } + buf[0] = 0; + strncpy_from_unsafe(buf, + (void *) (long) unsafe_addr, + sizeof(buf)); + } continue; } -- cgit v1.2.3 From b7dc42fd79390c074e2bff3b172b585d5c2d80c2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 3 Sep 2015 08:57:12 -0400 Subject: ring-buffer: Revert "ring-buffer: Get timestamp after event is allocated" The commit a4543a2fa9ef31 "ring-buffer: Get timestamp after event is allocated" is needed for some future work. But after adding it, there is a race somewhere that causes the saved timestamp to have a slight shift, and get ahead of the actual timestamp and make it look like time goes backwards. I'm still looking into why this happens, but in the mean time, this is holding up other work to get in. I'm reverting the change for now (which makes the problem go away), and will add it back after I know what is wrong and fix it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 156 +++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 106 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1cce0fbf92ce..fc347f8b1bca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2141,8 +2141,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); - /* * This is the slow path, force gcc not to inline it. */ @@ -2157,16 +2155,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, int ret; u64 ts; - /* - * If the event had a timestamp attached to it, remove it. - * The first event on a page (nested or not) always uses - * the full timestamp of the new page. 
- */ - if (info->add_timestamp) { - info->add_timestamp = 0; - info->length -= RB_LEN_TIME_EXTEND; - } - next_page = tail_page; rb_inc_page(cpu_buffer, &next_page); @@ -2253,11 +2241,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_tail(cpu_buffer, tail, info); - /* Commit what we have for now to update timestamps */ - rb_end_commit(cpu_buffer); - /* rb_end_commit() decs committing */ - local_inc(&cpu_buffer->committing); - /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2287,6 +2270,9 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) return skip_time_extend(event); } +static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + /** * rb_update_event - update event type and data * @event: the event to update @@ -2298,7 +2284,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) * and with this, we can determine what to place into the * data field. */ -static void __always_inline +static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) @@ -2306,6 +2292,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; + /* * If we need to add a timestamp, then we * add it to the start of the resevered space. @@ -2685,13 +2675,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static noinline void rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, struct rb_event_info *info) { - struct ring_buffer_event *padding; - int length; - int size; - WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, @@ -2701,61 +2686,7 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - - /* - * Discarding this event to add a timestamp in front, but - * we still need to update the length of it to perform the discard. - */ - rb_update_event(cpu_buffer, event, info); - - if (rb_try_to_discard(cpu_buffer, event)) { - info->add_timestamp = 1; - /* - * The time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - info->length += RB_LEN_TIME_EXTEND; - return; - } - - /* - * Humpf! An event came in after this one, and because it is not a - * commit, it will have a delta of zero, thus, it will take on - * the timestamp of the previous commit, which happened a long time - * ago (we need to add a timestamp, remember?). - * We need to add the timestamp here. A timestamp is a fixed size - * of 8 bytes. That means the rest of the event needs to be - * padding. - */ - size = info->length - RB_LEN_TIME_EXTEND; - - /* The padding will have a delta of 1 */ - if (size) - info->delta--; - - padding = rb_add_time_stamp(event, info->delta); - - if (size) { - length = info->length; - info->delta = 0; - info->length = size; - rb_update_event(cpu_buffer, padding, info); - - rb_event_discard(padding); - - /* Still visible, need to update write_stamp */ - rb_update_write_stamp(cpu_buffer, event); - - /* Still need to commit the padding. 
*/ - rb_end_commit(cpu_buffer); - - /* rb_end_commit() decs committing */ - local_inc(&cpu_buffer->committing); - - /* The next iteration still uses the original length */ - info->length = length; - } + info->add_timestamp = 1; } static struct ring_buffer_event * @@ -2765,7 +2696,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write; - bool is_commit; + + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; tail_page = info->tail_page = cpu_buffer->tail_page; write = local_add_return(info->length, &tail_page->write); @@ -2774,43 +2712,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - info->length; - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, tail, info); - - /* We reserved something on the buffer */ - event = __rb_page_index(tail_page, tail); - /* * If this is the first commit on the page, then it has the same - * timestamp as the page itself, otherwise we need to figure out - * the delta. + * timestamp as the page itself. */ - info->ts = rb_time_stamp(cpu_buffer->buffer); - is_commit = rb_event_is_commit(cpu_buffer, event); - - /* Commits are special (non nested events) */ - info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0; - - if (!tail) { - /* - * If this is the first commit on the page, set the - * page to its timestamp. - */ - tail_page->page->time_stamp = info->ts; + if (!tail) info->delta = 0; - } else if (unlikely(test_time_stamp(info->delta)) && - !info->add_timestamp) { - rb_handle_timestamp(cpu_buffer, event, info); - return ERR_PTR(-EAGAIN); - } + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) + return rb_move_tail(cpu_buffer, tail, info); + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); + /* + * If this is the first commit on the page, then update + * its timestamp. + */ + if (!tail) + tail_page->page->time_stamp = info->ts; + /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); @@ -2825,6 +2752,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; + u64 diff; rb_start_commit(cpu_buffer); @@ -2842,9 +2770,12 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } #endif + info.length = rb_calculate_event_length(length); - info.add_timestamp = 0; again: + info.add_timestamp = 0; + info.delta = 0; + /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2857,6 +2788,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; + + /* make sure this diff is calculated here */ + barrier(); + + /* Did the write stamp get updated already? 
*/ + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); + } + event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) -- cgit v1.2.3
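Looping back to the bpf_perf_event_read() helper introduced earlier in this log, a kprobe-attached BPF program could use it roughly as sketched below. This is written in the style of the kernel's samples/bpf programs; the "bpf_helpers.h" header, the section names and the map layout are assumptions rather than something taken from the patches above, and user space is expected to have populated the perf event array with one hardware counter fd per CPU before the program runs.

        #include <linux/ptrace.h>
        #include <linux/version.h>
        #include <uapi/linux/bpf.h>
        #include "bpf_helpers.h"        /* assumed helper stubs, as used by samples/bpf */

        /* One hardware counter fd per CPU slot, filled in by user space. */
        struct bpf_map_def SEC("maps") counters = {
                .type           = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
                .key_size       = sizeof(int),
                .value_size     = sizeof(u32),
                .max_entries    = 64,
        };

        SEC("kprobe/sys_write")
        int read_counter(struct pt_regs *ctx)
        {
                char fmt[] = "cpu %d counter %llu\n";
                u32 key = bpf_get_smp_processor_id();
                u64 count;

                /* the helper returns the counter value, or a negative errno in-band */
                count = bpf_perf_event_read(&counters, key);
                bpf_trace_printk(fmt, sizeof(fmt), key, count);
                return 0;
        }

        char _license[] SEC("license") = "GPL";
        u32 _version SEC("version") = LINUX_VERSION_CODE;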