From 9fe6b778ca93e6171dbb8e54df557a278a91abea Mon Sep 17 00:00:00 2001 From: Gil Fruchter Date: Tue, 9 Jun 2015 10:32:34 +0300 Subject: tracing: Prefer kcalloc over kzalloc with multiply Use kcalloc for allocating an array instead of kzalloc with multiply, as that is what kcalloc is used for. Found with checkpatch. Link: http://lkml.kernel.org/r/1433835155-6894-2-git-send-email-gilf@ezchip.com Signed-off-by: Gil Fruchter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff8743..5d219384b4d1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(num_possible_cpus(), sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; -- cgit v1.2.3 From 72917235fd5f08638be1d52dcdb0fee3ce2cc95f Mon Sep 17 00:00:00 2001 From: Gil Fruchter Date: Tue, 9 Jun 2015 10:32:35 +0300 Subject: tracing: Fix for non-continuous cpu ids Currently exception occures due to access beyond buffer_iter range while using index of cpu bigger than num_possible_cpus(). Below there is an example for such exception when we use cpus 0,1,16,17. In order to fix buffer allocation size for non-continuous cpu ids we allocate according to the max cpu id and not according to the amount of possible cpus. Example: $ cat /sys/kernel/debug/tracing/per_cpu/cpu1/trace Path: /bin/busybox CPU: 0 PID: 82 Comm: cat Not tainted 4.0.0 #29 task: 80734c80 ti: 80012000 task.ti: 80012000 [ECR ]: 0x00220100 => Invalid Read @ 0x00000000 by insn @ 0x800abafc [EFA ]: 0x00000000 [BLINK ]: ring_buffer_read_finish+0x24/0x64 [ERET ]: rb_check_pages+0x20/0x188 [STAT32]: 0x00001a00 : BTA: 0x800abafc SP: 0x80013f0c FP: 0x57719cf8 LPS: 0x200036b4 LPE: 0x200036b8 LPC: 0x00000000 r00: 0x8002aca0 r01: 0x00001606 r02: 0x00000000 r03: 0x00000001 r04: 0x00000000 r05: 0x804b4954 r06: 0x00030003 r07: 0x8002a260 r08: 0x00000286 r09: 0x00080002 r10: 0x00001006 r11: 0x807351a4 r12: 0x00000001 Stack Trace: rb_check_pages+0x20/0x188 ring_buffer_read_finish+0x24/0x64 tracing_release+0x4e/0x170 __fput+0x62/0x158 task_work_run+0xa2/0xd4 do_notify_resume+0x52/0x7c resume_user_mode_begin+0xdc/0xe0 Link: http://lkml.kernel.org/r/1433835155-6894-3-git-send-email-gilf@ezchip.com Signed-off-by: Noam Camus Signed-off-by: Gil Fruchter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d219384b4d1..59814adc39d6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kcalloc(num_possible_cpus(), sizeof(*iter->buffer_iter), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; -- cgit v1.2.3 From 5e2d5ef8ec1e3854daec41a3697a8d2ce05ff2ef Mon Sep 17 00:00:00 2001 From: Umesh Tiwari Date: Mon, 22 Jun 2015 16:55:06 +0530 Subject: ftrace: correct the counter increment for trace_buffer data In ftrace_dump, for disabling buffer, iter.tr->trace_buffer.data is used. But for enabling, iter.trace_buffer->data is used. 
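A minimal sketch of why the two spellings name the same per-CPU data, under simplified, assumed struct layouts (only the aliasing that trace_init_global_iter() establishes is modeled; this is an illustration, not kernel code):

#include <assert.h>

/* Simplified, hypothetical layouts; only the aliasing matters here. */
struct trace_buffer   { void *data; };
struct trace_array    { struct trace_buffer trace_buffer; };
struct trace_iterator {
	struct trace_array  *tr;
	struct trace_buffer *trace_buffer;
};

/* Mirrors what trace_init_global_iter() does with the global trace array. */
static void init_iter(struct trace_iterator *iter, struct trace_array *tr)
{
	iter->tr = tr;
	iter->trace_buffer = &tr->trace_buffer;
}

int main(void)
{
	static struct trace_array global_trace;
	struct trace_iterator iter;

	init_iter(&iter, &global_trace);
	/* The two spellings used in ftrace_dump() refer to the same object. */
	assert(&iter.tr->trace_buffer.data == &iter.trace_buffer->data);
	return 0;
}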
Even though, both point to same buffer, for readability, same convention should be used. Link: http://lkml.kernel.org/r/1434972306-20043-1-git-send-email-umesh.t@samsung.com Signed-off-by: Umesh Tiwari Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59814adc39d6..6e79408674aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; -- cgit v1.2.3 From fcc742eaad7cbcbbb2a96edc8f1d22adbaa804cb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 17:13:14 -0400 Subject: ring-buffer: Add event descriptor to simplify passing data Add rb_event_info descriptor to pass event info to functions a bit easier than using a bunch of parameters. This will also allow for changing the code around a bit to find better fast paths. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 91 ++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 39 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3..ba8f25ffcf6f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -399,6 +399,17 @@ struct rb_irq_work { bool wakeup_full; }; +/* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + /* * Used for which event context the event is in. * NMI = 0 @@ -2000,9 +2011,12 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) */ static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) + struct ring_buffer_event *event, + struct rb_event_info *info) { + unsigned length = info->length; + u64 delta = info->delta; + /* Only a commit updates the timestamp */ if (unlikely(!rb_event_is_commit(cpu_buffer, event))) delta = 0; @@ -2011,7 +2025,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * If we need to add a timestamp, then we * add it to the start of the resevered space. 
*/ - if (unlikely(add_timestamp)) { + if (unlikely(info->add_timestamp)) { event = rb_add_time_stamp(event, delta); length -= RB_LEN_TIME_EXTEND; delta = 0; @@ -2203,10 +2217,11 @@ static unsigned rb_calculate_event_length(unsigned length) static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2276,13 +2291,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2368,25 +2384,24 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) + struct rb_event_info *info) { - struct buffer_page *tail_page; struct ring_buffer_event *event; + struct buffer_page *tail_page; unsigned long tail, write; /* @@ -2394,33 +2409,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * hold in the time field of the event, then we append a * TIME EXTEND event ahead of the data event. */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; - tail = write - length; + tail = write - info->length; /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ if (!tail) - delta = 0; + info->delta = 0; /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + return rb_move_tail(cpu_buffer, tail, info); /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); @@ -2429,10 +2443,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * its timestamp. 
*/ if (!tail) - tail_page->page->time_stamp = ts; + tail_page->page->time_stamp = info->ts; /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + local_add(info->length, &cpu_buffer->entries_bytes); return event; } @@ -2521,9 +2535,8 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + struct rb_event_info info; int nr_loops = 0; - int add_timestamp; u64 diff; rb_start_commit(cpu_buffer); @@ -2543,10 +2556,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, } #endif - length = rb_calculate_event_length(length); + info.length = rb_calculate_event_length(length); again: - add_timestamp = 0; - delta = 0; + info.add_timestamp = 0; + info.delta = 0; /* * We allow for interrupts to reenter here and do a trace. @@ -2560,35 +2573,35 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK local_clock_stable = sched_clock_stable(); #endif - WARN_ONCE(delta > (1ULL << 59), + WARN_ONCE(info.delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, + (unsigned long long)info.delta, + (unsigned long long)info.ts, (unsigned long long)cpu_buffer->write_stamp, local_clock_stable ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; + info.add_timestamp = 1; } } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); + event = __rb_reserve_next(cpu_buffer, &info); + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; -- cgit v1.2.3 From 9826b2733a4399149072058a11f611357479229d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 17:36:45 -0400 Subject: ring-buffer: Move the adding of the extended timestamp out of line Requiring a extended time stamp is an uncommon occurrence, and it is best to do it out of line when needed. Add a noinline function that handles the extended timestamp and have it called with an unlikely to completely move it out of the fast path. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ba8f25ffcf6f..a78d4ee4bc58 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2396,6 +2396,29 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! 
%llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} + static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -2582,22 +2605,8 @@ rb_reserve_next_event(struct ring_buffer *buffer, /* Did the write stamp get updated already? */ if (likely(info.ts >= cpu_buffer->write_stamp)) { info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(info.delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info.delta, - (unsigned long long)info.ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - info.add_timestamp = 1; - } + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); } event = __rb_reserve_next(cpu_buffer, &info); -- cgit v1.2.3 From a4543a2fa9ef31d6d0f854a4e14f8f82e7996d8d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 09:40:18 -0400 Subject: ring-buffer: Get timestamp after event is allocated Move the capturing of the timestamp to after an event is allocated. If the event is not a commit (where it is an event that preempted another event), then no timestamp is needed, because the delta of nested events is always zero. If the event starts on a new page, no delta needs to be calculated as the full timestamp will be added to the page header, and the event will have a delta of zero. Now if the event requires a time extend (the delta does not fit in the 27 bit delta slot in the header), then the event is discarded, the length is extended to hold the TIME_EXTEND event that allows for a 59 bit delta, and the commit is tried again. If the event can't be discarded (another event came in after it), then the TIME_EXTEND is added directly to the allocated event and the rest of the event is given padding. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 161 ++++++++++++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 47 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a78d4ee4bc58..b5ed553e0a45 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2009,7 +2009,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) * and with this, we can determine what to place into the * data field. */ -static void +static void __always_inline rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) @@ -2017,10 +2017,6 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - /* * If we need to add a timestamp, then we * add it to the start of the resevered space. 
@@ -2286,6 +2282,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2300,6 +2298,16 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, int ret; u64 ts; + /* + * If the event had a timestamp attached to it, remove it. + * The first event on a page (nested or not) always uses + * the full timestamp of the new page. + */ + if (info->add_timestamp) { + info->add_timestamp = 0; + info->length -= RB_LEN_TIME_EXTEND; + } + next_page = tail_page; rb_inc_page(cpu_buffer, &next_page); @@ -2386,6 +2394,11 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now to update timestamps */ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2403,10 +2416,23 @@ static inline bool sched_clock_stable(void) } #endif +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); +static inline void rb_event_discard(struct ring_buffer_event *event); +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + static noinline void rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, struct rb_event_info *info) { + struct ring_buffer_event *padding; + int length; + int size; + WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, @@ -2416,7 +2442,61 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - info->add_timestamp = 1; + + /* + * Discarding this event to add a timestamp in front, but + * we still need to update the length of it to perform the discard. + */ + rb_update_event(cpu_buffer, event, info); + + if (rb_try_to_discard(cpu_buffer, event)) { + info->add_timestamp = 1; + /* + * The time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + info->length += RB_LEN_TIME_EXTEND; + return; + } + + /* + * Humpf! An event came in after this one, and because it is not a + * commit, it will have a delta of zero, thus, it will take on + * the timestamp of the previous commit, which happened a long time + * ago (we need to add a timestamp, remember?). + * We need to add the timestamp here. A timestamp is a fixed size + * of 8 bytes. That means the rest of the event needs to be + * padding. + */ + size = info->length - RB_LEN_TIME_EXTEND; + + /* The padding will have a delta of 1 */ + if (size) + info->delta--; + + padding = rb_add_time_stamp(event, info->delta); + + if (size) { + length = info->length; + info->delta = 0; + info->length = size; + rb_update_event(cpu_buffer, padding, info); + + rb_event_discard(padding); + + /* Still visible, need to update write_stamp */ + rb_update_write_stamp(cpu_buffer, event); + + /* Still need to commit the padding. 
*/ + rb_end_commit(cpu_buffer); + + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + + /* The next iteration still uses the original length */ + info->length = length; + } } static struct ring_buffer_event * @@ -2426,14 +2506,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write; - - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(info->add_timestamp)) - info->length += RB_LEN_TIME_EXTEND; + bool is_commit; tail_page = info->tail_page = cpu_buffer->tail_page; write = local_add_return(info->length, &tail_page->write); @@ -2442,31 +2515,42 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - info->length; - /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. - */ - if (!tail) - info->delta = 0; - /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, tail, info); /* We reserved something on the buffer */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, info); - - local_inc(&tail_page->entries); /* - * If this is the first commit on the page, then update - * its timestamp. + * If this is the first commit on the page, then it has the same + * timestamp as the page itself, otherwise we need to figure out + * the delta. */ - if (!tail) + info->ts = rb_time_stamp(cpu_buffer->buffer); + is_commit = rb_event_is_commit(cpu_buffer, event); + + /* Commits are special (non nested events) */ + info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0; + + if (!tail) { + /* + * If this is the first commit on the page, set the + * page to its timestamp. + */ tail_page->page->time_stamp = info->ts; + info->delta = 0; + + } else if (unlikely(test_time_stamp(info->delta)) && + !info->add_timestamp) { + rb_handle_timestamp(cpu_buffer, event, info); + return ERR_PTR(-EAGAIN); + } + + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); @@ -2560,7 +2644,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; - u64 diff; rb_start_commit(cpu_buffer); @@ -2578,12 +2661,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } #endif - info.length = rb_calculate_event_length(length); - again: info.add_timestamp = 0; - info.delta = 0; - + again: /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2596,19 +2676,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - info.ts = rb_time_stamp(cpu_buffer->buffer); - diff = info.ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? 
*/ - if (likely(info.ts >= cpu_buffer->write_stamp)) { - info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) - rb_handle_timestamp(cpu_buffer, &info); - } - event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) -- cgit v1.2.3 From 7d75e6833b579adb3de2c7b917de1204eeafea47 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 10:29:10 -0400 Subject: ring-buffer: Make sure event has enough room for extend and padding Now that events only add time extends after it is committed, in case an event comes in before it can discard the allocated event, the time extend needs to be stored within the event. If the event is bigger than then size needed for the time extend, padding must be added. The minimum padding size is 8 bytes. Thus if the event is 12 bytes (size of time extend + 4), there will not be enough room to add both the time extend and padding. Make sure all events are either 8 bytes or 16 or more bytes. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b5ed553e0a45..781ce359976c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2208,6 +2208,21 @@ static unsigned rb_calculate_event_length(unsigned length) length += RB_EVNT_HDR_SIZE; length = ALIGN(length, RB_ARCH_ALIGNMENT); + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). + */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + return length; } -- cgit v1.2.3 From d90fd77402d3de56a9ca3df04e5d868d0979dc59 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 12:12:27 -0400 Subject: ring-buffer: Reorganize function locations Functions in ring-buffer.c have gotten interleaved between different use cases. Move the functions around to get like functions closer together. This may or may not help gcc keep cache locality, but it makes it a little easier to work with the code. 
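To make the sizing rule in rb_calculate_event_length() above concrete, here is a standalone sketch of the rounding it describes (the constants are assumptions picked to mirror a typical 4-byte-aligned configuration, and the RB_FORCE_8BYTE_ALIGNMENT case is omitted; this is an illustration, not the kernel function):

#include <stdio.h>

#define RB_ALIGNMENT		4U
#define RB_EVNT_HDR_SIZE	4U
#define RB_LEN_TIME_EXTEND	8U
#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * 28)	/* assumed inline-length limit */

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/*
 * Reserved size for a data payload: header plus aligned payload, except
 * that a 12-byte result is bumped to 16 so the event could later hold an
 * 8-byte TIME_EXTEND plus the 8-byte minimum padding.
 */
static unsigned int event_length(unsigned int length)
{
	if (!length)
		length = 1;			/* zero length causes confusion */
	if (length > RB_MAX_SMALL_DATA)
		length += sizeof(unsigned int);	/* explicit length word */
	length += RB_EVNT_HDR_SIZE;
	length = ALIGN_UP(length, RB_ALIGNMENT);
	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)	/* 12 -> 16 */
		length += RB_ALIGNMENT;
	return length;
}

int main(void)
{
	for (unsigned int data = 1; data <= 12; data++)
		printf("payload %2u bytes -> reserve %2u bytes\n",
		       data, event_length(data));
	return 0;
}

With these assumed constants a 5 to 8 byte payload, which would otherwise round to a 12-byte event, is reserved as 16 bytes instead, so small events come out as either 8 bytes or at least 16.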
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 920 ++++++++++++++++++++++----------------------- 1 file changed, 456 insertions(+), 464 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 781ce359976c..1cce0fbf92ce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; @@ -1979,63 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. 
- */ -static void __always_inline -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, - struct rb_event_info *info) -{ - unsigned length = info->length; - u64 delta = info->delta; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(info->add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2194,38 +2070,6 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length++; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - /* - * In case the time delta is larger than the 27 bits for it - * in the header, we need to add a timestamp. If another - * event comes in when trying to discard this one to increase - * the length, then the timestamp will be added in the allocated - * space of this event. If length is bigger than the size needed - * for the TIME_EXTEND, then padding has to be used. The events - * length must be either RB_LEN_TIME_EXTEND, or greater than or equal - * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. - * As length is a multiple of 4, we only need to worry if it - * is 12 (RB_LEN_TIME_EXTEND + 4). - */ - if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) - length += RB_ALIGNMENT; - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) @@ -2424,71 +2268,471 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) { - return true; -} -#endif + event->type_len = RINGBUF_TYPE_TIME_EXTEND; -static inline int -rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); -static inline void rb_event_discard(struct ring_buffer_event *event); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } -static noinline void -rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, - struct rb_event_info *info) -{ - struct ring_buffer_event *padding; - int length; - int size; + return skip_time_extend(event); +} - WARN_ONCE(info->delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info->delta, - (unsigned long long)info->ts, - (unsigned long long)cpu_buffer->write_stamp, - sched_clock_stable() ? 
"" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void __always_inline +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; /* - * Discarding this event to add a timestamp in front, but - * we still need to update the length of it to perform the discard. + * If we need to add a timestamp, then we + * add it to the start of the resevered space. */ - rb_update_event(cpu_buffer, event, info); - - if (rb_try_to_discard(cpu_buffer, event)) { - info->add_timestamp = 1; - /* - * The time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - info->length += RB_LEN_TIME_EXTEND; - return; + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; } - /* - * Humpf! An event came in after this one, and because it is not a - * commit, it will have a delta of zero, thus, it will take on - * the timestamp of the previous commit, which happened a long time - * ago (we need to add a timestamp, remember?). - * We need to add the timestamp here. A timestamp is a fixed size - * of 8 bytes. That means the rest of the event needs to be - * padding. - */ - size = info->length - RB_LEN_TIME_EXTEND; - - /* The padding will have a delta of 1 */ - if (size) - info->delta--; + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} + +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length++; + + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); + + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). 
+ */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + + return length; +} + +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_ts_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + old_index += write_mask; + new_index += write_mask; + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); + return 1; + } + } + + /* could not discard */ + return 0; +} + +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. 
+ */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. + */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + /* + * The event first in the commit queue updates the + * time stamp. + */ + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} + +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; + + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + +/* + * The lock and unlock are done within a preempt disable section. 
+ * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ + +static __always_inline int +trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned int val = cpu_buffer->current_context; + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = RB_CTX_NMI; + else if (in_irq()) + bit = RB_CTX_IRQ; + else + bit = RB_CTX_SOFTIRQ; + } else + bit = RB_CTX_NORMAL; + + if (unlikely(val & (1 << bit))) + return 1; + + val |= (1 << bit); + cpu_buffer->current_context = val; + + return 0; +} + +static __always_inline void +trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->current_context &= cpu_buffer->current_context - 1; +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + struct ring_buffer_event *padding; + int length; + int size; + + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + + /* + * Discarding this event to add a timestamp in front, but + * we still need to update the length of it to perform the discard. 
+ */ + rb_update_event(cpu_buffer, event, info); + + if (rb_try_to_discard(cpu_buffer, event)) { + info->add_timestamp = 1; + /* + * The time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + info->length += RB_LEN_TIME_EXTEND; + return; + } + + /* + * Humpf! An event came in after this one, and because it is not a + * commit, it will have a delta of zero, thus, it will take on + * the timestamp of the previous commit, which happened a long time + * ago (we need to add a timestamp, remember?). + * We need to add the timestamp here. A timestamp is a fixed size + * of 8 bytes. That means the rest of the event needs to be + * padding. + */ + size = info->length - RB_LEN_TIME_EXTEND; + + /* The padding will have a delta of 1 */ + if (size) + info->delta--; padding = rb_add_time_stamp(event, info->delta); @@ -2573,84 +2817,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static inline int -rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; - - new_index = rb_event_index(event); - old_index = new_index + rb_event_ts_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - unsigned long write_mask = - local_read(&bpage->write) & ~RB_WRITE_MASK; - unsigned long event_length = rb_event_length(event); - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. - */ - old_index += write_mask; - new_index += write_mask; - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) { - /* update counters */ - local_sub(event_length, &cpu_buffer->entries_bytes); - return 1; - } - } - - /* could not discard */ - return 0; -} - -static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - local_inc(&cpu_buffer->committing); - local_inc(&cpu_buffer->commits); -} - -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long commits; - - if (RB_WARN_ON(cpu_buffer, - !local_read(&cpu_buffer->committing))) - return; - - again: - commits = local_read(&cpu_buffer->commits); - /* synchronize with interrupts */ - barrier(); - if (local_read(&cpu_buffer->committing) == 1) - rb_set_commit_to_write(cpu_buffer); - - local_dec(&cpu_buffer->committing); - - /* synchronize with interrupts */ - barrier(); - - /* - * Need to account for interrupts coming in between the - * updating of the commit page and the clearing of the - * committing counter. - */ - if (unlikely(local_read(&cpu_buffer->commits) != commits) && - !local_read(&cpu_buffer->committing)) { - local_inc(&cpu_buffer->committing); - goto again; - } -} - static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, @@ -2706,75 +2872,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } -/* - * The lock and unlock are done within a preempt disable section. - * The current_context per_cpu variable can only be modified - * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. 
To pass this - * information from the lock to the unlock without having to - * access the 'in_interrupt()' functions again (which do show - * a bit of overhead in something as critical as function tracing, - * we use a bitmask trick. - * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. - * - * This works because this is the order of contexts that can - * preempt other contexts. A SoftIRQ never preempts an IRQ - * context. - * - * When the context is determined, the corresponding bit is - * checked and set (if it was set, then a recursion of that context - * happened). - * - * On unlock, we need to clear this bit. To do so, just subtract - * 1 from the current_context and AND it to itself. - * - * (binary) - * 101 - 1 = 100 - * 101 & 100 = 100 (clearing bit zero) - * - * 1010 - 1 = 1001 - * 1010 & 1001 = 1000 (clearing bit 1) - * - * The least significant bit can be cleared this way, and it - * just so happens that it is the same bit corresponding to - * the current context. - */ - -static __always_inline int -trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned int val = cpu_buffer->current_context; - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = RB_CTX_NMI; - else if (in_irq()) - bit = RB_CTX_IRQ; - else - bit = RB_CTX_SOFTIRQ; - } else - bit = RB_CTX_NORMAL; - - if (unlikely(val & (1 << bit))) - return 1; - - val |= (1 << bit); - cpu_buffer->current_context = val; - - return 0; -} - -static __always_inline void -trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->current_context &= cpu_buffer->current_context - 1; -} - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2833,111 +2930,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. 
- */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) -{ - bool pagebusy; - - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); - } - - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } - - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. - */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - rb_wakeups(buffer, cpu_buffer); - - trace_recursive_unlock(cpu_buffer); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer -- cgit v1.2.3 From 72ac426a5bb0cec572d26b4456f8c1e14601694e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 16 Jul 2015 13:24:54 -0400 Subject: tracing: Clean up stack tracing and fix fentry updates Akashi Takahiro was porting the stack tracer to arm64 and found some issues with it. One was that it repeats the top function, due to the stack frame added by the mcount caller and added by itself. This was added when fentry came in, and before fentry created its own stack frame. But x86's fentry now creates its own stack frame, and there's no need to insert the function again. 
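A toy userspace sketch of the resulting behaviour, with invented addresses (the kernel walks stack_dump_trace[] against the traced ip; only the skip step is modeled here): rather than inserting ip as an extra first entry, the tracer skips the leading entries that belong to its own call chain until ip is found, so the traced function appears exactly once.

#include <stdio.h>

/*
 * Toy model: the unwinder reports the stack tracer's own helpers before
 * the traced function. Skipping entries until the traced ip is found
 * replaces the old trick of prepending ip, which duplicated the top
 * function. All addresses are made up for the example.
 */
int main(void)
{
	unsigned long traced_ip = 0xc0de;
	unsigned long captured[] = {
		0xbeef,		/* stack-saving helper            */
		0xfeed,		/* the tracer's check routine     */
		0xc0de,		/* the traced function itself     */
		0xabcd,		/* its caller                     */
		0x1234,		/* and so on up the stack         */
	};
	size_t n = sizeof(captured) / sizeof(captured[0]);
	size_t start = 0;

	while (start < n && captured[start] != traced_ip)
		start++;			/* drop tracer-internal frames */

	for (size_t i = start; i < n; i++)
		printf("%zu: %#lx\n", i - start, captured[i]);
	return 0;
}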
This also cleans up the code a bit, where it doesn't need to do something special for fentry, and doesn't include insertion of a duplicate entry for the called function being traced. Link: http://lkml.kernel.org/r/55A646EE.6030402@linaro.org Some-suggestions-by: Jungseok Lee Some-suggestions-by: Mark Rutland Reported-by: AKASHI Takahiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 68 ++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 45 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e9..b746399ab59c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,12 +18,6 @@ #define STACK_TRACE_ENTRIES 500 -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; @@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; */ static struct stack_trace max_stack_trace = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; static unsigned long max_stack_size; @@ -55,7 +49,7 @@ static inline void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); for (i = 0; i < max_stack_trace.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) @@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack) unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; @@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; max_stack_trace.nr_entries = 0; - - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); - /* - * Add the passed in ip from the function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. - */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack) while (i < max_stack_trace.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_dump_index[x] = this_size; p = start; for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_dump_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. 
*/ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); max_stack_size -= tracer_frame; @@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + max_stack_trace.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); @@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. - * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. - */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); -- cgit v1.2.3 From 8e436ca042d904533a1e14fdc85f0facdfca752f Mon Sep 17 00:00:00 2001 From: Umesh Tiwari Date: Mon, 22 Jun 2015 16:58:08 +0530 Subject: ftrace: add tracing_thresh to function profile This patch extends the tracing_thresh functionality to the function profile tracer. If tracing_thresh is set, print only those entries whose average is greater than tracing_thresh. Link: http://lkml.kernel.org/r/1434972488-8571-1-git-send-email-umesh.t@samsung.com Signed-off-by: Umesh Tiwari [ Removed unnecessary 'moved' comment ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02bece4a99ea..f46dbb5cdf76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -613,13 +613,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) -- cgit v1.2.3 From 82c355e81afbf16bc1ab379899a79eb66e2b7504 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 16 Jul 2015 21:58:52 -0400 Subject: ftrace: Fix function_graph duration spacing with 7-digits Jungseok Lee noticed the following: Currently, the row width of 7-digit duration numbers is not aligned with the other cases, as in the following example.
 3) $ 3999884 us | } 3) | finish_task_switch() { 3) 0.365 us | _raw_spin_unlock_irq(); 3) 3.333 us | } 3) $ 3999976 us | } 3) $ 3999979 us | } /* schedule */ By adding a single white space in the 7-digit case, the format can easily be unified as follows. 3) $ 2237472 us | } 3) | finish_task_switch() { 3) 0.364 us | _raw_spin_unlock_irq(); 3) 3.125 us | } 3) $ 2237556 us | } 3) $ 2237559 us | } /* schedule */ Instead of making a special case for 7-digit numbers, the logic of the len and the space loop is slightly modified to make the two cases have the same format. Link: http://lkml.kernel.org/r/1436626300-1679-2-git-send-email-jungseoklee85@gmail.com Reported-by: Jungseok Lee Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c12..ca98445782ac 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } -- cgit v1.2.3 From b838e1d96c613019095ba008afbee800977b0582 Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Sat, 11 Jul 2015 14:51:40 +0000 Subject: tracing: Introduce two additional marks for delay Fine-granularity support for delay would be very useful when profiling VM logic, such as page allocation (including page reclaim) and memory compaction, with the function graph tracer. Thus, this patch adds two additional marks with two changes. - An equal sign in the mark selection function is removed to align the code behavior with the comments and documentation. - The function graph example related to delay in ftrace.txt is updated to cover all supported marks.
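As a rough illustration of the resulting behaviour (a sketch only, not part of the patch), the mark selection can be modelled as a small standalone C program. The table mirrors the entries visible in the trace_output.c hunk below, and the strict '>' comparison reflects the removed equal sign; treat it as a model of the selection logic, not the exact kernel table.

        #include <stdio.h>

        /* Illustrative userspace model of the mark table after this patch;
         * values are in nanoseconds, as in trace_output.c. */
        static const struct {
                unsigned long long val;
                char sym;
        } mark[] = {
                { 1000000000ULL, '$' },        /* 1 sec */
                {  100000000ULL, '@' },        /* 100 msec */
                {   10000000ULL, '*' },        /* 10 msec */
                {    1000000ULL, '#' },        /* 1000 usecs */
                {     100000ULL, '!' },        /* 100 usecs */
                {      10000ULL, '+' },        /* 10 usecs */
        };

        static char find_mark(unsigned long long d)
        {
                size_t i;

                /* strict '>': a delay exactly equal to a threshold falls
                 * through to the next (smaller) mark */
                for (i = 0; i < sizeof(mark) / sizeof(mark[0]); i++)
                        if (d > mark[i].val)
                                break;
                return i < sizeof(mark) / sizeof(mark[0]) ? mark[i].sym : ' ';
        }

        int main(void)
        {
                printf("150 ms  -> '%c'\n", find_mark(150000000ULL)); /* '@' */
                printf("100 ms  -> '%c'\n", find_mark(100000000ULL)); /* '*', due to strict '>' */
                printf("42 usec -> '%c'\n", find_mark(42000ULL));     /* '+' */
                return 0;
        }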
Link: http://lkml.kernel.org/r/1436626300-1679-3-git-send-email-jungseoklee85@gmail.com Cc: Byungchul Park Signed-off-by: Jungseok Lee Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc..8e481a84aeea 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -496,6 +496,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } -- cgit v1.2.3 From 4246a0b63bd8f56a1469b12eafeb875b1041a451 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Jul 2015 15:29:37 +0200 Subject: block: add a bi_error field to struct bio Currently we have two different ways to signal an I/O error on a BIO: (1) by clearing the BIO_UPTODATE flag (2) by returning a Linux errno value to the bi_end_io callback The first one has the drawback of only communicating a single possible error (-EIO), and the second one has the drawback of not being persistent when bios are queued up, and of not being passed along from child to parent bio in the ever more popular chaining scenario. Having both mechanisms available has the additional drawback of utterly confusing driver authors and introducing bugs where various I/O submitters only deal with one of them, and the others have to add boilerplate code to deal with both kinds of error returns. So add a new bi_error field to store an errno value directly in struct bio and remove the existing mechanisms to clean all this up.
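To make the new convention concrete, here is a minimal sketch (not taken from the patch) of a driver-side completion path once bi_error is the single error channel. It assumes the post-series bio_endio() that takes only the bio and picks the error up from bio->bi_error, which is the same convention the blktrace hunks below switch to when they read bio->bi_error directly.

        /* Hedged sketch of a driver completing a bio under the new scheme;
         * assumes bio_endio() no longer takes an error argument. */
        #include <linux/bio.h>

        static void my_driver_end_io(struct bio *bio, int err)
        {
                bio->bi_error = err;    /* 0 on success, negative errno on failure */
                bio_endio(bio);
        }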
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: NeilBrown Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b3e6b39b6cf9..90e72a0c3047 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - if (!error && !bio_flagged(bio, BIO_UPTODATE)) - error = EIO; - __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, what, error, 0, NULL); } @@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); + bio->bi_error, sizeof(rpdu), &rpdu); } } @@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + sizeof(r), &r); } /** -- cgit v1.2.3 From fbd705a0c6184580d0e2fbcbd47a37b6e5822511 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jun 2015 11:13:36 +0200 Subject: sched: Introduce the 'trace_sched_waking' tracepoint Mathieu reported that since 317f394160e9 ("sched: Move the second half of ttwu() to the remote cpu") trace_sched_wakeup() can happen out of context of the waker. This is a problem when you want to analyse wakeup paths because it is now very hard to correlate the wakeup event to whoever issued the wakeup. OTOH trace_sched_wakeup() is issued at the point where we set p->state = TASK_RUNNING, which is right where we hand the task off to the scheduler, so this is an important point when looking at scheduling behaviour: up to here it has been the wakeup path, and everything hereafter is due to scheduler policy. To bridge this gap, introduce a second tracepoint: trace_sched_waking. It is guaranteed to be called in the waker context.
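For consumers, hooking the new tracepoint might look roughly like the sketch below (illustrative only; it assumes the register_trace_sched_waking() stub generated for the tracepoint and the same probe prototype, without the old 'success' argument, that the updated sched_wakeup probes in the hunks below use).

        #include <linux/module.h>
        #include <linux/sched.h>
        #include <linux/tracepoint.h>
        #include <trace/events/sched.h>

        /* Runs in the waker's context, so 'current' is the task issuing the
         * wakeup and 'p' is the task being woken. */
        static void my_waking_probe(void *ignore, struct task_struct *p)
        {
                trace_printk("%s/%d waking %s/%d\n",
                             current->comm, current->pid, p->comm, p->pid);
        }

        static int __init waking_example_init(void)
        {
                return register_trace_sched_waking(my_waking_probe, NULL);
        }

        static void __exit waking_example_exit(void)
        {
                unregister_trace_sched_waking(my_waking_probe, NULL);
                tracepoint_synchronize_unregister();
        }

        module_init(waking_example_init);
        module_exit(waking_example_exit);
        MODULE_LICENSE("GPL");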
Reported-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Francis Giraldeau Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150609091336.GQ3644@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 419ca37e72c9..f270088e9929 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n } static void -probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9b33dd117f3f..12cbe77b4136 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(void *ignore, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p) { struct trace_array_cpu *data; int cpu = smp_processor_id(); -- cgit v1.2.3 From 04a22fae4cbc1f7d3f7471e9b36359f98bd3f043 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:50 +0000 Subject: tracing, perf: Implement BPF programs attached to uprobes By copying the BPF-related operations to the uprobe processing path, this patch allows users to attach BPF programs to uprobes just as they already do with kprobes. After this patch, users are allowed to use PERF_EVENT_IOC_SET_BPF on a uprobe perf event. This makes it possible to profile user space programs and kernel events together using BPF. Because of this patch, CONFIG_BPF_EVENTS should be selected by CONFIG_UPROBE_EVENT to ensure trace_call_bpf() is compiled even if KPROBE_EVENT is not set.
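From user space, the attach step this enables can be sketched as follows (illustrative only; perf_fd is assumed to come from perf_event_open() on a uprobe-backed event and prog_fd from a prior bpf(BPF_PROG_LOAD, ...) call).

        #include <sys/ioctl.h>
        #include <linux/perf_event.h>

        /* Attach an already-loaded BPF program to a uprobe-backed perf event,
         * then enable the event; returns 0 on success, -1 on failure. */
        static int attach_bpf_to_uprobe_event(int perf_fd, int prog_fd)
        {
                if (ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0)
                        return -1;
                return ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);
        }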
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/Kconfig | 2 +- kernel/trace/trace_uprobe.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48ae153a..1153c43428f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on KPROBE_EVENT || UPROBE_EVENT bool default y help diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b36fa8..f97479f1ce35 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1095,11 +1095,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1289,6 +1293,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); -- cgit v1.2.3 From 35578d7984003097af2b1e34502bc943d40c1804 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 6 Aug 2015 07:02:35 +0000 Subject: bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter According to the perf_event_map_fd and index, the function bpf_perf_event_read() can convert the corresponding map value to the pointer to struct perf_event and return the Hardware PMU counter value. Signed-off-by: Kaixu Xia Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041adee90..ef9936df1b04 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (!event) + return -ENOENT; + + /* + * we don't know if the function is run successfully by the + * return value. It can be judged in other places, such as + * eBPF programs. 
+ */ + return perf_event_read_local(event); +} + +const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -183,6 +212,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; default: return NULL; } -- cgit v1.2.3 From 9f61668073a8d80650622e792aff876db9ca23c6 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 10 Aug 2015 14:35:46 +0200 Subject: tracing: Allow triggers to filter for CPU ids and process names By extending the filter rules by more generic fields we can write triggers filters like echo 'stacktrace if cpu == 1' > \ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger or echo 'stacktrace if comm == sshd' > \ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger CPU and COMM are not part of struct trace_entry. We could add the two new fields to ftrace_common_field list and fix up all depending sides. But that looks pretty ugly. Another thing I would like to avoid that the 'format' file contents changes. All this can be avoided by introducing another list which contains non field members of struct trace_entry. Link: http://lkml.kernel.org/r/1439210146-24707-1-git-send-email-daniel.wagner@bmw-carit.de Signed-off-by: Daniel Wagner Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 25 ++++++++++++++++++ kernel/trace/trace_events_filter.c | 54 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 404a372ad85a..7ca09cdc20c2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -30,6 +30,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; + field = __find_event_field(&ftrace_generic_fields, name); + if (field) + return field; + field = __find_event_field(&ftrace_common_fields, name); if (field) return field; @@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, } EXPORT_SYMBOL_GPL(trace_define_field); +#define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ + filter_type); \ + if (ret) \ + return ret; + #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ @@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; +static int trace_define_generic_fields(void) +{ + int ret; + + __generic_field(int, cpu, FILTER_OTHER); + __generic_field(char *, comm, FILTER_PTR_STRING); + + return ret; +} + static int trace_define_common_fields(void) { int ret; @@ -2671,6 +2693,9 @@ static __init int event_trace_init(void) if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); + if (trace_define_generic_fields()) + pr_warn("tracing: 
Failed to allocated generic fields"); + if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d81d6f302b14..bd1bf184c5c9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* Filter predicate for CPUs. */ +static int filter_pred_cpu(struct filter_pred *pred, void *event) +{ + int cpu, cmp; + int match = 0; + + cpu = raw_smp_processor_id(); + cmp = pred->val; + + switch (pred->op) { + case OP_EQ: + match = cpu == cmp; + break; + case OP_LT: + match = cpu < cmp; + break; + case OP_LE: + match = cpu <= cmp; + break; + case OP_GT: + match = cpu > cmp; + break; + case OP_GE: + match = cpu >= cmp; + break; + default: + break; + } + + return !!match == !pred->not; +} + +/* Filter predicate for COMM. */ +static int filter_pred_comm(struct filter_pred *pred, void *event) +{ + int cmp, match; + + cmp = pred->regex.match(current->comm, &pred->regex, + pred->regex.field_len); + match = cmp ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; @@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps, if (is_string_field(field)) { filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) { + if (!strcmp(field->name, "comm")) { + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - fn = select_comparison_fn(pred->op, field->size, + if (!strcmp(field->name, "cpu")) + fn = filter_pred_cpu; + else + fn = select_comparison_fn(pred->op, field->size, field->is_signed); if (!fn) { parse_error(ps, FILT_ERR_INVALID_OP, 0); -- cgit v1.2.3 From a2fb3382edbea83c6f2bf6ac15e3673b2e254aad Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:46 +0000 Subject: tracing/uprobes: Do not print '0x (null)' when offset is 0 When manually added uprobe point with zero address, 'uprobe_events' output '(null)' instead of 0x00000000: # echo p:probe_libc/abs_0 /path/to/lib.bin:0x0 arg1=%ax > \ /sys/kernel/debug/tracing/uprobe_events # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x (null) arg1=%ax This patch fixes this behavior: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x0000000000000000 Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_uprobe.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f97479f1ce35..d2f6d0be3503 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + seq_printf(m, " %s:", tu->filename); + + /* Don't 
print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); -- cgit v1.2.3 From 1a6877b9c0c2ad901d4335d909432d3bb6d3a330 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Aug 2015 15:56:22 -0700 Subject: lib: introduce strncpy_from_unsafe() generalize FETCH_FUNC_NAME(memory, string) into strncpy_from_unsafe() and fix sparse warnings that were present in original implementation. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/trace_kprobe.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7d0cdd9906c..c9956440d0e6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory) static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr, void *dest) { - long ret; int maxlen = get_rloc_len(*(u32 *)dest); u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); + long ret; if (!maxlen) return; @@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, * Try to get string again, since the string can be changed while * probing. */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); + ret = strncpy_from_unsafe(dst, addr, maxlen); if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; + dst[0] = '\0'; *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); } } NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); -- cgit v1.2.3 From 8d3b7dce8622919da5c5822ef7338d6604c9fe6e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Aug 2015 15:56:23 -0700 Subject: bpf: add support for %s specifier to bpf_trace_printk() %s specifier makes bpf program and kernel debugging easier. To make sure that trace_printk won't crash the unsafe string is copied into stack and unsafe pointer is substituted. The following C program: #include int foo(struct pt_regs *ctx, struct filename *filename) { void *name = 0; bpf_probe_read(&name, sizeof(name), &filename->name); bpf_trace_printk("executed %s\n", name); return 0; } when attached to kprobe do_execve() will produce output in /sys/kernel/debug/tracing/trace_pipe : make-13492 [002] d..1 3250.997277: : executed /bin/sh sh-13493 [004] d..1 3250.998716: : executed /usr/bin/gcc gcc-13494 [002] d..1 3250.999822: : executed /usr/lib/gcc/x86_64-linux-gnu/4.7/cc1 gcc-13495 [002] d..1 3251.006731: : executed /usr/bin/as gcc-13496 [002] d..1 3251.011831: : executed /usr/lib/gcc/x86_64-linux-gnu/4.7/collect2 collect2-13497 [000] d..1 3251.012941: : executed /usr/bin/ld Suggested-by: Brendan Gregg Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/trace/bpf_trace.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ef9936df1b04..0fe96c7c8803 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = { /* * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) { char *fmt = (char *) (long) r1; + bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; + u64 unsafe_addr; + char buf[64]; int i; /* @@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) if (fmt[i] == 'l') { mod[fmt_cnt]++; i++; - } else if (fmt[i] == 'p') { + } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; i++; if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) return -EINVAL; fmt_cnt++; + if (fmt[i - 1] == 's') { + if (str_seen) + /* allow only one '%s' per fmt string */ + return -EINVAL; + str_seen = true; + + switch (fmt_cnt) { + case 1: + unsafe_addr = r3; + r3 = (long) buf; + break; + case 2: + unsafe_addr = r4; + r4 = (long) buf; + break; + case 3: + unsafe_addr = r5; + r5 = (long) buf; + break; + } + buf[0] = 0; + strncpy_from_unsafe(buf, + (void *) (long) unsafe_addr, + sizeof(buf)); + } continue; } -- cgit v1.2.3 From b7dc42fd79390c074e2bff3b172b585d5c2d80c2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 3 Sep 2015 08:57:12 -0400 Subject: ring-buffer: Revert "ring-buffer: Get timestamp after event is allocated" The commit a4543a2fa9ef31 "ring-buffer: Get timestamp after event is allocated" is needed for some future work. But after adding it, there is a race somewhere that causes the saved timestamp to have a slight shift, and get ahead of the actual timestamp and make it look like time goes backwards. I'm still looking into why this happens, but in the mean time, this is holding up other work to get in. I'm reverting the change for now (which makes the problem go away), and will add it back after I know what is wrong and fix it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 156 +++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 106 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1cce0fbf92ce..fc347f8b1bca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2141,8 +2141,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); - /* * This is the slow path, force gcc not to inline it. */ @@ -2157,16 +2155,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, int ret; u64 ts; - /* - * If the event had a timestamp attached to it, remove it. - * The first event on a page (nested or not) always uses - * the full timestamp of the new page. 
- */ - if (info->add_timestamp) { - info->add_timestamp = 0; - info->length -= RB_LEN_TIME_EXTEND; - } - next_page = tail_page; rb_inc_page(cpu_buffer, &next_page); @@ -2253,11 +2241,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_tail(cpu_buffer, tail, info); - /* Commit what we have for now to update timestamps */ - rb_end_commit(cpu_buffer); - /* rb_end_commit() decs committing */ - local_inc(&cpu_buffer->committing); - /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2287,6 +2270,9 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) return skip_time_extend(event); } +static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + /** * rb_update_event - update event type and data * @event: the event to update @@ -2298,7 +2284,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) * and with this, we can determine what to place into the * data field. */ -static void __always_inline +static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) @@ -2306,6 +2292,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; + /* * If we need to add a timestamp, then we * add it to the start of the resevered space. @@ -2685,13 +2675,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static noinline void rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, struct rb_event_info *info) { - struct ring_buffer_event *padding; - int length; - int size; - WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, @@ -2701,61 +2686,7 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - - /* - * Discarding this event to add a timestamp in front, but - * we still need to update the length of it to perform the discard. - */ - rb_update_event(cpu_buffer, event, info); - - if (rb_try_to_discard(cpu_buffer, event)) { - info->add_timestamp = 1; - /* - * The time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - info->length += RB_LEN_TIME_EXTEND; - return; - } - - /* - * Humpf! An event came in after this one, and because it is not a - * commit, it will have a delta of zero, thus, it will take on - * the timestamp of the previous commit, which happened a long time - * ago (we need to add a timestamp, remember?). - * We need to add the timestamp here. A timestamp is a fixed size - * of 8 bytes. That means the rest of the event needs to be - * padding. - */ - size = info->length - RB_LEN_TIME_EXTEND; - - /* The padding will have a delta of 1 */ - if (size) - info->delta--; - - padding = rb_add_time_stamp(event, info->delta); - - if (size) { - length = info->length; - info->delta = 0; - info->length = size; - rb_update_event(cpu_buffer, padding, info); - - rb_event_discard(padding); - - /* Still visible, need to update write_stamp */ - rb_update_write_stamp(cpu_buffer, event); - - /* Still need to commit the padding. 
*/ - rb_end_commit(cpu_buffer); - - /* rb_end_commit() decs committing */ - local_inc(&cpu_buffer->committing); - - /* The next iteration still uses the original length */ - info->length = length; - } + info->add_timestamp = 1; } static struct ring_buffer_event * @@ -2765,7 +2696,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write; - bool is_commit; + + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; tail_page = info->tail_page = cpu_buffer->tail_page; write = local_add_return(info->length, &tail_page->write); @@ -2774,43 +2712,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - info->length; - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, tail, info); - - /* We reserved something on the buffer */ - event = __rb_page_index(tail_page, tail); - /* * If this is the first commit on the page, then it has the same - * timestamp as the page itself, otherwise we need to figure out - * the delta. + * timestamp as the page itself. */ - info->ts = rb_time_stamp(cpu_buffer->buffer); - is_commit = rb_event_is_commit(cpu_buffer, event); - - /* Commits are special (non nested events) */ - info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0; - - if (!tail) { - /* - * If this is the first commit on the page, set the - * page to its timestamp. - */ - tail_page->page->time_stamp = info->ts; + if (!tail) info->delta = 0; - } else if (unlikely(test_time_stamp(info->delta)) && - !info->add_timestamp) { - rb_handle_timestamp(cpu_buffer, event, info); - return ERR_PTR(-EAGAIN); - } + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) + return rb_move_tail(cpu_buffer, tail, info); + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); + /* + * If this is the first commit on the page, then update + * its timestamp. + */ + if (!tail) + tail_page->page->time_stamp = info->ts; + /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); @@ -2825,6 +2752,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; + u64 diff; rb_start_commit(cpu_buffer); @@ -2842,9 +2770,12 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } #endif + info.length = rb_calculate_event_length(length); - info.add_timestamp = 0; again: + info.add_timestamp = 0; + info.delta = 0; + /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2857,6 +2788,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; + + /* make sure this diff is calculated here */ + barrier(); + + /* Did the write stamp get updated already? 
*/ + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); + } + event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) -- cgit v1.2.3
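Looping back to the bpf_perf_event_read() helper introduced earlier in this log, a kprobe-attached BPF program could use it roughly as sketched below. This is written in the style of the kernel's samples/bpf programs; the "bpf_helpers.h" header, the section names and the map layout are assumptions rather than something taken from the patches above, and user space is expected to have populated the perf event array with one hardware counter fd per CPU before the program runs.

        #include <linux/ptrace.h>
        #include <linux/version.h>
        #include <uapi/linux/bpf.h>
        #include "bpf_helpers.h"        /* assumed helper stubs, as used by samples/bpf */

        /* One hardware counter fd per CPU slot, filled in by user space. */
        struct bpf_map_def SEC("maps") counters = {
                .type           = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
                .key_size       = sizeof(int),
                .value_size     = sizeof(u32),
                .max_entries    = 64,
        };

        SEC("kprobe/sys_write")
        int read_counter(struct pt_regs *ctx)
        {
                char fmt[] = "cpu %d counter %llu\n";
                u32 key = bpf_get_smp_processor_id();
                u64 count;

                /* the helper returns the counter value, or a negative errno in-band */
                count = bpf_perf_event_read(&counters, key);
                bpf_trace_printk(fmt, sizeof(fmt), key, count);
                return 0;
        }

        char _license[] SEC("license") = "GPL";
        u32 _version SEC("version") = LINUX_VERSION_CODE;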