From ba532500c5651a4be4108acc64ed99a95cb005b3 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:55:58 -0500
Subject: perf: Introduce pmu context switch callback

The callback is invoked when process is scheduled in or out.
It provides mechanism for later patches to save/store the LBR
stack. For the schedule in case, the callback is invoked at
the same place that flush branch stack callback is invoked.
So it also can replace the flush branch stack callback. To
avoid unnecessary overhead, the callback is enabled only when
there are events use the LBR stack.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-3-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 33262004c310..fbab6235d053 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -265,6 +265,13 @@ struct pmu {
 	 * flush branch stack on context-switches (needed in cpu-wide mode)
 	 */
 	void (*flush_branch_stack)	(void);
+
+	/*
+	 * context-switches callback
+	 */
+	void (*sched_task)		(struct perf_event_context *ctx,
+					bool sched_in);
+
 };
 
 /**
@@ -558,6 +565,8 @@ extern void perf_event_delayed_put(struct task_struct *task);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu);
+extern void perf_sched_cb_inc(struct pmu *pmu);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 extern int perf_event_refresh(struct perf_event *event, int refresh);
-- 
cgit v1.2.3


From 2a0ad3b326a9024ba86dca4028499d31fa0c6c4d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:55:59 -0500
Subject: perf/x86/intel: Use context switch callback to flush LBR stack

Previous commit introduces context switch callback, its function
overlaps with the flush branch stack callback. So we can use the
context switch callback to flush LBR stack.

This patch adds code that uses the flush branch callback to
flush the LBR stack when task is being scheduled in. The callback
is enabled only when there are events use the LBR hardware. This
patch also removes all old flush branch stack code.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-4-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fbab6235d053..c7007a564440 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -511,7 +511,6 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	int				nr_cgroups;	 /* cgroup evts */
-	int				nr_branch_stack; /* branch_stack evt */
 	struct rcu_head			rcu_head;
 
 	struct delayed_work		orphans_remove;
-- 
cgit v1.2.3


From 4af57ef28c2c1047fda9e1a5be02aa7a6a69cf9e Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:01 -0500
Subject: perf: Add pmu specific data for perf task context

Introduce a new flag PERF_ATTACH_TASK_DATA for perf event's attach
stata. The flag is set by PMU's event_init() callback, it indicates
that perf event needs PMU specific data.

The PMU specific data are initialized to zeros. Later patches will
use PMU specific data to save LBR stack.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-6-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c7007a564440..270cd0173e61 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -271,6 +271,10 @@ struct pmu {
 	 */
 	void (*sched_task)		(struct perf_event_context *ctx,
 					bool sched_in);
+	/*
+	 * PMU specific data size
+	 */
+	size_t				task_ctx_size;
 
 };
 
@@ -307,6 +311,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_CONTEXT	0x01
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
+#define PERF_ATTACH_TASK_DATA	0x08
 
 struct perf_cgroup;
 struct ring_buffer;
@@ -511,6 +516,7 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	int				nr_cgroups;	 /* cgroup evts */
+	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
 
 	struct delayed_work		orphans_remove;
-- 
cgit v1.2.3


From a46a23000198d929391aa9dac8de68734efa2703 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:06 -0500
Subject: perf: Simplify the branch stack check

Use event->attr.branch_sample_type to replace
intel_pmu_needs_lbr_smpl() for avoiding duplicated code that
implicitly enables the LBR.

Currently, branch stack can be enabled by user explicitly requesting
branch sampling or implicit branch sampling to correct PEBS skid.

For user explicitly requested branch sampling, the branch_sample_type
is explicitly set by user. For PEBS case, the branch_sample_type is also
implicitly set to PERF_SAMPLE_BRANCH_ANY in x86_pmu_hw_config.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-11-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 270cd0173e61..43cc158487e6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -814,6 +814,11 @@ static inline bool has_branch_stack(struct perf_event *event)
 	return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
 }
 
+static inline bool needs_branch_stack(struct perf_event *event)
+{
+	return event->attr.branch_sample_type != 0;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
-- 
cgit v1.2.3


From acba3c7e4652ca5fcb2fd9376d58c2dffd8ddf2a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 14 Jan 2015 14:15:39 +0100
Subject: perf, powerpc: Fix up flush_branch_stack() users

The recent LBR rework for x86 left a stray flush_branch_stack() user in
the PowerPC code, fix that up.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joel Stanley <joel@jms.id.au>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 43cc158487e6..724d3720c9b1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -261,11 +261,6 @@ struct pmu {
 	 */
 	int (*event_idx)		(struct perf_event *event); /*optional */
 
-	/*
-	 * flush branch stack on context-switches (needed in cpu-wide mode)
-	 */
-	void (*flush_branch_stack)	(void);
-
 	/*
 	 * context-switches callback
 	 */
-- 
cgit v1.2.3


From 39bed6cbb842d8edf5a26b01122b391d36775b5e Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:40 +0000
Subject: perf: Make perf_cgroup_from_task() global

Move perf_cgroup_from_task() from kernel/events/ to include/linux/ along
with the necessary struct definitions, so that it can be used by the PMU
code.

When the upcoming Intel Cache Monitoring PMU driver assigns monitoring
IDs to perf events, it needs to be able to check whether any two
monitoring events overlap (say, a cgroup and task event), which means we
need to be able to lookup the cgroup associated with a task (if any).

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-2-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 724d3720c9b1..cae4a9481777 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -53,6 +53,7 @@ struct perf_guest_info_callbacks {
 #include <linux/sysfs.h>
 #include <linux/perf_regs.h>
 #include <linux/workqueue.h>
+#include <linux/cgroup.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -547,6 +548,35 @@ struct perf_output_handle {
 	int				page;
 };
 
+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64				time;
+	u64				timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state	css;
+	struct perf_cgroup_info	__percpu *info;
+};
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_css(task, perf_event_cgrp_id),
+			    struct perf_cgroup, css);
+}
+#endif /* CONFIG_CGROUP_PERF */
+
 #ifdef CONFIG_PERF_EVENTS
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-- 
cgit v1.2.3


From eacd3ecc34472ce3751eedfc94e44c7cc6eb6305 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:41 +0000
Subject: perf: Add ->count() function to read per-package counters

For PMU drivers that record per-package counters, the ->count variable
cannot be used to record an accurate aggregated value, since it's not
possible to perform SMP cross-calls to cpus on other packages from the
context in which we update ->count.

Introduce a new optional ->count() accessor function that can be used to
customize how values are collected. If a PMU driver doesn't provide a
->count() function, we fallback to the existing code.

There is necessarily a window of staleness with this approach because
the task that generated the counter value may not have been scheduled by
the cpu recently.

An alternative and more complex approach would be to use a hrtimer to
periodically refresh the values from a more permissive scheduling
context. So, we're trading off complexity for accuracy.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-3-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index cae4a9481777..9fc9b0d31442 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -272,6 +272,11 @@ struct pmu {
 	 */
 	size_t				task_ctx_size;
 
+
+	/*
+	 * Return the count value for a counter.
+	 */
+	u64 (*count)			(struct perf_event *event); /*optional*/
 };
 
 /**
@@ -770,6 +775,11 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 		__perf_event_task_sched_out(prev, next);
 }
 
+static inline u64 __perf_event_count(struct perf_event *event)
+{
+	return local64_read(&event->count) + atomic64_read(&event->child_count);
+}
+
 extern void perf_event_mmap(struct vm_area_struct *vma);
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
-- 
cgit v1.2.3


From 4afbb24ce5e723c8a093a6674a3c33062175078a Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:44 +0000
Subject: perf/x86/intel: Add Intel Cache QoS Monitoring support

Future Intel Xeon processors support a Cache QoS Monitoring feature that
allows tracking of the LLC occupancy for a task or task group, i.e. the
amount of data in pulled into the LLC for the task (group).

Currently the PMU only supports per-cpu events. We create an event for
each cpu and read out all the LLC occupancy values.

Because this results in duplicate values being written out to userspace,
we also export a .per-pkg event file so that the perf tools only
accumulate values for one cpu per package.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-6-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9fc9b0d31442..ca5504c48f4f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -123,6 +123,13 @@ struct hw_perf_event {
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+		struct { /* intel_cqm */
+			int			cqm_state;
+			int			cqm_rmid;
+			struct list_head	cqm_events_entry;
+			struct list_head	cqm_groups_entry;
+			struct list_head	cqm_group_entry;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
-- 
cgit v1.2.3


From bfe1fcd2688f557a6b6a88f59ea7619228728bd7 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:46 +0000
Subject: perf/x86/intel: Support task events with Intel CQM

Add support for task events as well as system-wide events. This change
has a big impact on the way that we gather LLC occupancy values in
intel_cqm_event_read().

Currently, for system-wide (per-cpu) events we defer processing to
userspace which knows how to discard all but one cpu result per package.

Things aren't so simple for task events because we need to do the value
aggregation ourselves. To do this, we defer updating the LLC occupancy
value in event->count from intel_cqm_event_read() and do an SMP
cross-call to read values for all packages in intel_cqm_event_count().
We need to ensure that we only do this for one task event per cache
group, otherwise we'll report duplicate values.

If we're a system-wide event we want to fallback to the default
perf_event_count() implementation. Refactor this into a common function
so that we don't duplicate the code.

Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an
event's task (if the event isn't per-cpu) inside of the Intel CQM PMU
driver.  This task information is only availble in the upper layers of
the perf infrastructure.

Other perf backends stash the target task in event->hw.*target so we
need to do something similar. The task is used to determine whether
events should share a cache group and an RMID.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ca5504c48f4f..dac4c2831d82 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,7 @@ struct hw_perf_event {
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
+			struct task_struct	*cqm_target;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
-- 
cgit v1.2.3


From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 5 Mar 2015 22:10:19 +0100
Subject: perf: Remove type specific target pointers

The only reason CQM had to use a hard-coded pmu type was so it could use
cqm_target in hw_perf_event.

Do away with the {tp,bp,cqm}_target pointers and provide a non type
specific one.

This allows us to do away with that silly pmu type as well.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Vince Weaver <vince@deater.net>
Cc: acme@kernel.org
Cc: acme@redhat.com
Cc: hpa@zytor.com
Cc: jolsa@redhat.com
Cc: kanaka.d.juvva@intel.com
Cc: matt.fleming@intel.com
Cc: tglx@linutronix.de
Cc: torvalds@linux-foundation.org
Cc: vikas.shivappa@linux.intel.com
Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dac4c2831d82..5aa49d7bfd07 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -119,7 +119,6 @@ struct hw_perf_event {
 			struct hrtimer	hrtimer;
 		};
 		struct { /* tracepoint */
-			struct task_struct	*tp_target;
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
@@ -129,7 +128,6 @@ struct hw_perf_event {
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
-			struct task_struct	*cqm_target;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
@@ -138,12 +136,12 @@ struct hw_perf_event {
 			 * problem hw_breakpoint has with context
 			 * creation and event initalization.
 			 */
-			struct task_struct		*bp_target;
 			struct arch_hw_breakpoint	info;
 			struct list_head		bp_list;
 		};
 #endif
 	};
+	struct task_struct		*target;
 	int				state;
 	local64_t			prev_count;
 	u64				sample_period;
-- 
cgit v1.2.3


From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 Feb 2015 14:05:38 +0100
Subject: perf: Add per event clockid support

While thinking on the whole clock discussion it occurred to me we have
two distinct uses of time:

 1) the tracking of event/ctx/cgroup enabled/running/stopped times
    which includes the self-monitoring support in struct
    perf_event_mmap_page.

 2) the actual timestamps visible in the data records.

And we've been conflating them.

The first is all about tracking time deltas, nobody should really care
in what time base that happens, its all relative information, as long
as its internally consistent it works.

The second however is what people are worried about when having to
merge their data with external sources. And here we have the
discussion on MONOTONIC vs MONOTONIC_RAW etc..

Where MONOTONIC is good for correlating between machines (static
offset), MONOTNIC_RAW is required for correlating against a fixed rate
hardware clock.

This means configurability; now 1) makes that hard because it needs to
be internally consistent across groups of unrelated events; which is
why we had to have a global perf_clock().

However, for 2) it doesn't really matter, perf itself doesn't care
what it writes into the buffer.

The below patch makes the distinction between these two cases by
adding perf_event_clock() which is used for the second case. It
further makes this configurable on a per-event basis, but adds a few
sanity checks such that we cannot combine events with different clocks
in confusing ways.

And since we then have per-event configurability we might as well
retain the 'legacy' behaviour as a default.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b16eac5f54ce..401554074de9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -173,6 +173,7 @@ struct perf_event;
  * pmu::capabilities flags
  */
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
+#define PERF_PMU_CAP_NO_NMI			0x02
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -457,6 +458,7 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	u64				(*clock)(void);
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
-- 
cgit v1.2.3


From 4e537f7fbdce5e8ae7c33ebaa8a1956c7727d5a7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 25 Mar 2015 12:49:18 -0700
Subject: bpf: Make internal bpf API independent of CONFIG_BPF_SYSCALL #ifdefs

Socket filter code and other subsystems with upcoming eBPF
support should not need to deal with the fact that we have
CONFIG_BPF_SYSCALL defined or not.

Having the bpf syscall as a config option is a nice thing and
I'd expect it to stay that way for expert users (I presume one
day the default setting of it might change, though), but code
making use of it should not care if it's actually enabled or
not.

Instead, hide this via header files and let the rest deal with it.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1427312966-8434-2-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/bpf.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..c2e21113ecc0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -113,8 +113,6 @@ struct bpf_prog_type_list {
 	enum bpf_prog_type type;
 };
 
-void bpf_register_prog_type(struct bpf_prog_type_list *tl);
-
 struct bpf_prog;
 
 struct bpf_prog_aux {
@@ -129,11 +127,25 @@ struct bpf_prog_aux {
 };
 
 #ifdef CONFIG_BPF_SYSCALL
+void bpf_register_prog_type(struct bpf_prog_type_list *tl);
+
 void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
 #else
-static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+}
+
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_prog_put(struct bpf_prog *prog)
+{
+}
 #endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
-- 
cgit v1.2.3


From 72cbbc8994242b5b43753738c01bf07bf29cb70d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 25 Mar 2015 12:49:19 -0700
Subject: tracing: Add kprobe flag

add TRACE_EVENT_FL_KPROBE flag to differentiate kprobe type of
tracepoints, since bpf programs can only be attached to kprobe
type of PERF_TYPE_TRACEPOINT perf events.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-3-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ftrace_event.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c674ee8f7fca..77325e1a1816 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -252,6 +252,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
 };
 
 /*
@@ -265,6 +266,7 @@ enum {
  *                     it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -274,6 +276,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
 struct ftrace_event_call {
-- 
cgit v1.2.3


From 2541517c32be2531e0da59dfd7efc1ce844644f5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 25 Mar 2015 12:49:20 -0700
Subject: tracing, perf: Implement BPF programs attached to kprobes

BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.

The user interface is to attach to a kprobe via the perf syscall:

	struct perf_event_attr attr = {
		.type	= PERF_TYPE_TRACEPOINT,
		.config	= event_id,
		...
	};

	event_fd = perf_event_open(&attr,...);
	ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

'prog_fd' is a file descriptor associated with BPF program
previously loaded.

'event_id' is an ID of the kprobe created.

Closing 'event_fd':

	close(event_fd);

... automatically detaches BPF program from it.

BPF programs can call in-kernel helper functions to:

  - lookup/update/delete elements in maps

  - probe_read - wraper of probe_kernel_read() used to access any
    kernel data structures

BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.

Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ftrace_event.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 77325e1a1816..0aa535bc9f05 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -306,6 +307,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
-- 
cgit v1.2.3


From 45bfb2e50471abbbfd83d40d28c986078b0d24ff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 14 Jan 2015 14:18:11 +0200
Subject: perf: Add AUX area to ring buffer for raw data streams

This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.

AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.

In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.

Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 401554074de9..5a94f6d6fa91 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -284,6 +284,18 @@ struct pmu {
 	 * Return the count value for a counter.
 	 */
 	u64 (*count)			(struct perf_event *event); /*optional*/
+
+	/*
+	 * Set up pmu-private data structures for an AUX area
+	 */
+	void *(*setup_aux)		(int cpu, void **pages,
+					 int nr_pages, bool overwrite);
+					/* optional */
+
+	/*
+	 * Free pmu-private AUX data structures
+	 */
+	void (*free_aux)		(void *aux); /* optional */
 };
 
 /**
@@ -862,6 +874,11 @@ static inline bool needs_branch_stack(struct perf_event *event)
 	return event->attr.branch_sample_type != 0;
 }
 
+static inline bool has_aux(struct perf_event *event)
+{
+	return event->pmu->setup_aux;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
-- 
cgit v1.2.3


From 0a4e38e64f5e91ce131cc42ee5bb3925377ec840 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:12 +0200
Subject: perf: Support high-order allocations for AUX space

Some pmus (such as BTS or Intel PT without multiple-entry ToPA capability)
don't support scatter-gather and will prefer larger contiguous areas for
their output regions.

This patch adds a new pmu capability to request higher order allocations.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-4-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5a94f6d6fa91..d5a4a8e95808 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -174,6 +174,7 @@ struct perf_event;
  */
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
 #define PERF_PMU_CAP_NO_NMI			0x02
+#define PERF_PMU_CAP_AUX_NO_SG			0x04
 
 /**
  * struct pmu - generic performance monitoring unit
-- 
cgit v1.2.3


From 6a279230391b63130070e0219b0ad09d34d28c89 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:13 +0200
Subject: perf: Add a capability for AUX_NO_SG pmus to do software double
 buffering

For pmus that don't support scatter-gather for AUX data in hardware, it
might still make sense to implement software double buffering to avoid
losing data while the user is reading data out. For this purpose, add
a pmu capability that guarantees multiple high-order chunks for AUX buffer,
so that the pmu driver can do switchover tricks.

To make use of this feature, add PERF_PMU_CAP_AUX_SW_DOUBLEBUF to your
pmu's capability mask. This will make the ring buffer AUX allocation code
ensure that the biggest high order allocation for the aux buffer pages is
no bigger than half of the total requested buffer size, thus making sure
that the buffer has at least two high order allocations.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-5-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d5a4a8e95808..13a1eb3a2a2d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -175,6 +175,7 @@ struct perf_event;
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
 #define PERF_PMU_CAP_NO_NMI			0x02
 #define PERF_PMU_CAP_AUX_NO_SG			0x04
+#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF		0x08
 
 /**
  * struct pmu - generic performance monitoring unit
-- 
cgit v1.2.3


From bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Fri, 30 Jan 2015 12:31:06 +0200
Subject: perf: Add a pmu capability for "exclusive" events

Usually, pmus that do, for example, instruction tracing, would only ever
be able to have one event per task per cpu (or per perf_event_context). For
such pmus it makes sense to disallow creating conflicting events early on,
so as to provide consistent behavior for the user.

This patch adds a pmu capability that indicates such constraint on event
creation.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1422613866-113186-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 13a1eb3a2a2d..f936a1e51f29 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -176,6 +176,7 @@ struct perf_event;
 #define PERF_PMU_CAP_NO_NMI			0x02
 #define PERF_PMU_CAP_AUX_NO_SG			0x04
 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF		0x08
+#define PERF_PMU_CAP_EXCLUSIVE			0x10
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -196,6 +197,7 @@ struct pmu {
 
 	int * __percpu			pmu_disable_count;
 	struct perf_cpu_context * __percpu pmu_cpu_context;
+	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
 	int				task_ctx_nr;
 	int				hrtimer_interval_ms;
 
-- 
cgit v1.2.3


From fdc2670666f40ab3e03143f04d1ebf4a05e2c24a Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:16 +0200
Subject: perf: Add API for PMUs to write to the AUX area

For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.

After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.

PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.

Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.

Nested writers are forbidden and guards are in place to catch such
attempts.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f936a1e51f29..45c5873ad9b3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -573,7 +573,10 @@ struct perf_output_handle {
 	struct ring_buffer		*rb;
 	unsigned long			wakeup;
 	unsigned long			size;
-	void				*addr;
+	union {
+		void			*addr;
+		unsigned long		head;
+	};
 	int				page;
 };
 
@@ -608,6 +611,14 @@ perf_cgroup_from_task(struct task_struct *task)
 
 #ifdef CONFIG_PERF_EVENTS
 
+extern void *perf_aux_output_begin(struct perf_output_handle *handle,
+				   struct perf_event *event);
+extern void perf_aux_output_end(struct perf_output_handle *handle,
+				unsigned long size, bool truncated);
+extern int perf_aux_output_skip(struct perf_output_handle *handle,
+				unsigned long size);
+extern void *perf_get_aux(struct perf_output_handle *handle);
+
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
@@ -898,6 +909,17 @@ extern void perf_event_disable(struct perf_event *event);
 extern int __perf_event_disable(void *info);
 extern void perf_event_task_tick(void);
 #else /* !CONFIG_PERF_EVENTS: */
+static inline void *
+perf_aux_output_begin(struct perf_output_handle *handle,
+		      struct perf_event *event)				{ return NULL; }
+static inline void
+perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
+		    bool truncated)					{ }
+static inline int
+perf_aux_output_skip(struct perf_output_handle *handle,
+		     unsigned long size)				{ return -EINVAL; }
+static inline void *
+perf_get_aux(struct perf_output_handle *handle)				{ return NULL; }
 static inline void
 perf_event_task_sched_in(struct task_struct *prev,
 			 struct task_struct *task)			{ }
-- 
cgit v1.2.3


From ec0d7729bbaed4b9d2d3fada693278e13a3d1368 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:23 +0200
Subject: perf: Add ITRACE_START record to indicate that tracing has started

For counters that generate AUX data that is bound to the context of a
running task, such as instruction tracing, the decoder needs to know
exactly which task is running when the event is first scheduled in,
before the first sched_switch. The decoder's need to know this stems
from the fact that instruction flow trace decoding will almost always
require program's object code in order to reconstruct said flow and
for that we need at least its pid/tid in the perf stream.

To single out such instruction tracing pmus, this patch introduces
ITRACE PMU capability. The reason this is not part of RECORD_AUX
record is that not all pmus capable of generating AUX data need this,
and the opposite is *probably* also true.

While sched_switch covers for most cases, there are two problems with it:
the consumer will need to process events out of order (that is, having
found RECORD_AUX, it will have to skip forward to the nearest sched_switch
to figure out which task it was, then go back to the actual trace to
decode it) and it completely misses the case when the tracing is enabled
and disabled before sched_switch, for example, via PERF_EVENT_IOC_DISABLE.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-15-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 45c5873ad9b3..61992cf2e977 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,9 @@ struct hw_perf_event {
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
 		};
+		struct { /* itrace */
+			int			itrace_started;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
@@ -177,6 +180,7 @@ struct perf_event;
 #define PERF_PMU_CAP_AUX_NO_SG			0x04
 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF		0x08
 #define PERF_PMU_CAP_EXCLUSIVE			0x10
+#define PERF_PMU_CAP_ITRACE			0x20
 
 /**
  * struct pmu - generic performance monitoring unit
-- 
cgit v1.2.3


From b3738d29323344da3017a91010530cf3a58590fc Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 Nov 2014 20:07:03 +0100
Subject: watchdog: Add watchdog enable/disable all functions

This patch adds two new functions to enable/disable
the watchdog across all CPUs.

This will be used by the HT PMU bug workaround code to
disable/enable the NMI watchdog across quirk enablement.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1416251225-17721-12-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/watchdog.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 395b70e0eccf..a746bf5216f8 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -137,4 +137,12 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
 extern int watchdog_register_device(struct watchdog_device *);
 extern void watchdog_unregister_device(struct watchdog_device *);
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void watchdog_nmi_disable_all(void);
+void watchdog_nmi_enable_all(void);
+#else
+static inline void watchdog_nmi_disable_all(void) {}
+static inline void watchdog_nmi_enable_all(void) {}
+#endif
+
 #endif  /* ifndef _LINUX_WATCHDOG_H */
-- 
cgit v1.2.3