From ba532500c5651a4be4108acc64ed99a95cb005b3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:58 -0500 Subject: perf: Introduce pmu context switch callback The callback is invoked when a process is scheduled in or out. It provides a mechanism for later patches to save/restore the LBR stack. For the schedule in case, the callback is invoked at the same place that the flush branch stack callback is invoked. So it can also replace the flush branch stack callback. To avoid unnecessary overhead, the callback is enabled only when there are events that use the LBR stack. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..0efbd6cc2966 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1914,6 +1914,12 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (x86_pmu.sched_task) + x86_pmu.sched_task(ctx, sched_in); +} + static void x86_pmu_flush_branch_stack(void) { if (x86_pmu.flush_branch_stack) x86_pmu.flush_branch_stack(); } @@ -1950,6 +1956,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, + .sched_task = x86_pmu_sched_task, }; void arch_perf_update_userpage(struct perf_event *event, -- cgit v1.2.3 From 2a0ad3b326a9024ba86dca4028499d31fa0c6c4d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:59 -0500 Subject: perf/x86/intel: Use context switch callback to flush LBR stack The previous commit introduced the context switch callback, whose function overlaps with the flush branch stack callback. So we can use the context switch callback to flush the LBR stack. This patch adds code that uses the context switch callback to flush the LBR stack when a task is being scheduled in. The callback is enabled only when there are events that use the LBR hardware. This patch also removes all old flush branch stack code.
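For reference, the Intel-specific half of these two patches lives in perf_event_intel_lbr.c, which is outside the file shown in this log. A minimal sketch of what the sched_task hook does there, with the helper and field names treated as assumptions rather than verbatim code:

static void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

        /* Only pay the cost when somebody is actually using the LBR. */
        if (!cpuc->lbr_users)
                return;

        /*
         * On sched-in the LBR registers still hold branches recorded for
         * the previous task; clear them so new samples are not
         * misattributed.
         */
        if (sched_in)
                intel_pmu_lbr_reset();
}

The Intel PMU setup code then points x86_pmu.sched_task at a function of this shape, so the generic x86_pmu_sched_task() dispatch above reaches it on every context switch.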
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-4-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0efbd6cc2966..6b1fd26a37cf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1920,12 +1920,6 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) x86_pmu.sched_task(ctx, sched_in); } -static void x86_pmu_flush_branch_stack(void) -{ - if (x86_pmu.flush_branch_stack) - x86_pmu.flush_branch_stack(); -} - void perf_check_microcode(void) { if (x86_pmu.check_microcode) @@ -1955,7 +1949,6 @@ static struct pmu pmu = { .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, - .flush_branch_stack = x86_pmu_flush_branch_stack, .sched_task = x86_pmu_sched_task, }; -- cgit v1.2.3 From e18bf526422769611e7248135e36a4cea0e4e38d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:03 -0500 Subject: perf/x86/intel: Allocate space for storing LBR stack When the LBR call stack is enabled, it is necessary to save/restore the LBR stack on context switch. We can use pmu specific data to store the LBR stack when a task is scheduled out. This patch adds code that allocates the pmu specific data. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-8-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6b1fd26a37cf..8ffd71ec2173 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -432,6 +432,9 @@ int x86_pmu_hw_config(struct perf_event *event) } } + if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) + event->attach_state |= PERF_ATTACH_TASK_DATA; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -1950,6 +1953,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, + .task_ctx_size = sizeof(struct x86_perf_task_context), }; void arch_perf_update_userpage(struct perf_event *event, -- cgit v1.2.3 From 4b854900995194601d767fcd112307b21ed278b2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:08 -0500 Subject: perf/x86/intel: Re-organize code that implicitly enables LBR/PEBS Make a later patch more readable; no logic change.
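Referring back to the 'Allocate space for storing LBR stack' commit above: the task_ctx_size added there sizes the per-task buffer that later patches in the series fill on context switch. Reconstructed from the wider series as a sketch only (the exact field names and the MAX_LBR_ENTRIES bound are assumptions, not taken from this log), the structure looks roughly like:

struct x86_perf_task_context {
        u64     lbr_from[MAX_LBR_ENTRIES];      /* saved LBR 'from' addresses */
        u64     lbr_to[MAX_LBR_ENTRIES];        /* saved LBR 'to' addresses */
        int     lbr_callstack_users;            /* events using call-stack mode */
        int     lbr_stack_state;                /* none / valid / needs restore */
};

Because PERF_ATTACH_TASK_DATA is set for call-stack events, the core allocates one such block per task, giving the save/restore code somewhere to park the LBR stack while the task is scheduled out.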
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-13-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 59 ++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8ffd71ec2173..e0dab5ce61e9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -399,36 +399,35 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; - /* - * check that PEBS LBR correction does not conflict with - * whatever the user is asking with attr->branch_sample_type - */ - if (event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) { - u64 *br_type = &event->attr.branch_sample_type; - - if (has_branch_stack(event)) { - if (!precise_br_compat(event)) - return -EOPNOTSUPP; - - /* branch_sample_type is compatible */ - - } else { - /* - * user did not specify branch_sample_type - * - * For PEBS fixups, we capture all - * the branches at the priv level of the - * event. - */ - *br_type = PERF_SAMPLE_BRANCH_ANY; - - if (!event->attr.exclude_user) - *br_type |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - *br_type |= PERF_SAMPLE_BRANCH_KERNEL; - } + } + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } -- cgit v1.2.3 From c56716af8d27ca8dd6e45445ae1c0a05fd9753a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:28 -0700 Subject: x86/asm/entry, perf: Fix incorrect TIF_IA32 check in code_segment_base() We want to check whether user code is in 32-bit mode, not whether the task is nominally 32-bit. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/33e5107085ce347a8303560302b15c2cadd62c4c.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..979963bb3977 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2161,10 +2161,9 @@ static unsigned long code_segment_base(struct pt_regs *regs) if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else - if (test_thread_flag(TIF_IA32)) { - if (user_mode(regs) && regs->cs != __USER32_CS) - return get_segment_base(regs->cs); - } + if (user_mode(regs) && !user_64bit_mode(regs) && + regs->cs != __USER32_CS) + return get_segment_base(regs->cs); #endif return 0; } -- cgit v1.2.3 From 383f3af3f88aadafe1fcf1948987ad538683fb8c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:30 -0700 Subject: x86/asm/entry, perf: Explicitly optimize vm86 handling in code_segment_base() There's no point in checking the VM bit on 64-bit, and, since we're explicitly checking it, we can use user_mode_ignore_vm86() after the check. While we're at it, rearrange the #ifdef slightly to make the code flow a bit clearer. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/dc1457a734feccd03a19bb3538a7648582f57cdd.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 979963bb3977..56f7e60ad732 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2146,6 +2146,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) */ static unsigned long code_segment_base(struct pt_regs *regs) { + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ + +#ifdef CONFIG_X86_32 /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. @@ -2153,12 +2159,7 @@ static unsigned long code_segment_base(struct pt_regs *regs) if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; - /* - * For IA32 we look at the GDT/LDT segment base to convert the - * effective IP to a linear address. - */ -#ifdef CONFIG_X86_32 - if (user_mode(regs) && regs->cs != __USER_CS) + if (user_mode_ignore_vm86(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && -- cgit v1.2.3 From 294fe0f52a44c6f207211de0686c369a961b5533 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 17 Feb 2015 18:18:06 -0800 Subject: perf/x86/intel: Add INST_RETIRED.ALL workarounds On Broadwell INST_RETIRED.ALL cannot be used with any period that doesn't have the lowest 6 bits cleared. And the period should not be smaller than 128. This is erratum BDM11 and BDM55: http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf BDM11: When using a period < 100; we may get incorrect PEBS/PMI interrupts and/or an invalid counter state. 
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS records on overflow. Add a new callback to enforce this, and set it for Broadwell. How does this handle the case when an app requests a specific period with some of the bottom bits set? Short answer: Any useful instruction sampling period needs to be 4-6 orders of magnitude larger than 128, as a PMI every 128 instructions would instantly overwhelm the system and be throttled. So the +-64 error from this is really small compared to the period, much smaller than normal system jitter. Long answer (by Peterz): IFF we guarantee perf_event_attr::sample_period >= 128. Suppose we start out with sample_period=192; then we'll set period_left to 192, we'll end up with left = 128 (we truncate the lower bits). We get an interrupt, find that period_left = 64 (>0 so we return 0 and don't get an overflow handler), up that to 128. Then we trigger again, at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get an overflow). We increment with sample_period so we get left = 128. We fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an overflow). And on and on. So while the individual interrupts are 'wrong' we get them with interval=256,128 in exactly the right ratio to average out at 192. And this works for everything >=128. So the num_samples*fixed_period thing is still entirely correct +- 127, which is good enough I'd say, as you already have that error anyhow. So no need to 'fix' the tools, all we need to do is refuse to create INST_RETIRED:ALL events with sample_period < 128. Signed-off-by: Andi Kleen [ Updated comments and changelog a bit. ] Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e0dab5ce61e9..ec6e982fd464 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -451,6 +451,12 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + if (event->attr.sample_period && x86_pmu.limit_period) { + if (x86_pmu.limit_period(event, event->attr.sample_period) > + event->attr.sample_period) + return -EINVAL; + } + return x86_setup_perfctr(event); } @@ -988,6 +994,9 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; + if (x86_pmu.limit_period) + left = x86_pmu.limit_period(event, left); + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* -- cgit v1.2.3 From 9332d250b4b4f67c633894b311e022e3cf943bd5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Feb 2015 10:45:43 -0700 Subject: perf/x86: Remove redundant calls to perf_pmu_{dis|en}able() perf_pmu_disable() is called before pmu->add() and perf_pmu_enable() is called afterwards. No need to call these inside of x86_pmu_add() as well.
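For context on why those calls are redundant, here is a heavily simplified, paraphrased sketch of the core scheduling path (event_sched_in() in kernel/events/core.c of that era) that already brackets pmu->add(); this is illustrative, not verbatim kernel code:

static int event_sched_in(struct perf_event *event)
{
        int ret = 0;

        perf_pmu_disable(event->pmu);           /* PMU already disabled here ... */

        /* ... state and timestamp bookkeeping elided ... */

        if (event->pmu->add(event, PERF_EF_START))      /* x86_pmu_add() runs here */
                ret = -EAGAIN;

        perf_pmu_enable(event->pmu);            /* ... and re-enabled here */
        return ret;
}

So the disable/enable pair removed from x86_pmu_add() in the diff below was pure duplication of work the caller already does.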
Signed-off-by: David Ahern Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1424281543-67335-1-git-send-email-dsahern@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ec6e982fd464..ac41b3ad1fc9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1044,7 +1044,6 @@ static int x86_pmu_add(struct perf_event *event, int flags) hwc = &event->hw; - perf_pmu_disable(event->pmu); n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) @@ -1082,7 +1081,6 @@ done_collect: ret = 0; out: - perf_pmu_enable(event->pmu); return ret; } -- cgit v1.2.3 From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Feb 2015 14:05:38 +0100 Subject: perf: Add per event clockid support While thinking on the whole clock discussion it occurred to me we have two distinct uses of time: 1) the tracking of event/ctx/cgroup enabled/running/stopped times which includes the self-monitoring support in struct perf_event_mmap_page. 2) the actual timestamps visible in the data records. And we've been conflating them. The first is all about tracking time deltas, nobody should really care in what time base that happens, it's all relative information, as long as it's internally consistent it works. The second however is what people are worried about when having to merge their data with external sources. And here we have the discussion on MONOTONIC vs MONOTONIC_RAW etc. Where MONOTONIC is good for correlating between machines (static offset), MONOTONIC_RAW is required for correlating against a fixed rate hardware clock. This means configurability; now 1) makes that hard because it needs to be internally consistent across groups of unrelated events; which is why we had to have a global perf_clock(). However, for 2) it doesn't really matter, perf itself doesn't care what it writes into the buffer. The below patch makes the distinction between these two cases by adding perf_event_clock() which is used for the second case. It further makes this configurable on a per-event basis, but adds a few sanity checks such that we cannot combine events with different clocks in confusing ways. And since we then have per-event configurability we might as well retain the 'legacy' behaviour as a default. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ac41b3ad1fc9..0420ebcac116 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event, data = cyc2ns_read_begin(); + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain.
+ */ userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; userpg->time_offset = data->cyc2ns_offset - now; - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records. + */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } cyc2ns_read_end(data); } -- cgit v1.2.3 From 55474c48b4726fd3914c1ec47fced0f931729979 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 29 Mar 2015 11:02:34 +0200 Subject: x86/asm/entry: Remove user_mode_ignore_vm86() user_mode_ignore_vm86() can be used instead of user_mode(), in places where we have already done a v8086_mode() security check of ptregs. But doing this check in the wrong place would be a bug that could result in security problems, and also the naming still isn't very clear. Furthermore, it only affects 32-bit kernels, while most development happens on 64-bit kernels. If we replace them with user_mode() checks then the cost is only a very minor increase in various slowpaths: text data bss dec hex filename 10573391 703562 1753042 13029995 c6d26b vmlinux.o.before 10573423 703562 1753042 13030027 c6d28b vmlinux.o.after So lets get rid of this distinction once and for all. Acked-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andrew Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150329090233.GA1963@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 56f7e60ad732..e2888a3ad1e3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2159,7 +2159,7 @@ static unsigned long code_segment_base(struct pt_regs *regs) if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; - if (user_mode_ignore_vm86(regs) && regs->cs != __USER_CS) + if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && -- cgit v1.2.3 From 4807034248bed58d49a4f9f450c024e3b0f58577 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:20 +0200 Subject: perf/x86: Mark Intel PT and LBR/BTS as mutually exclusive Intel PT cannot be used at the same time as LBR or BTS and will cause a general protection fault if they are used together. In order to avoid fixing up GPs in the fast path, instead we disallow creating LBR/BTS events when PT events are present and vice versa. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-12-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0420ebcac116..549d01d6d996 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event) } } +void hw_perf_lbr_event_destroy(struct perf_event *event) +{ + hw_perf_event_destroy(event); + + /* undo the lbr/bts event accounting */ + x86_del_exclusive(x86_lbr_exclusive_lbr); +} + static inline int x86_pmu_initialized(void) { return x86_pmu.handle_irq != NULL; @@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return x86_pmu_extra_regs(val, event); } +/* + * Check if we can create event of a certain type (that no conflicting events + * are present). + */ +int x86_add_exclusive(unsigned int what) +{ + int ret = -EBUSY, i; + + if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) + return 0; + + mutex_lock(&pmc_reserve_mutex); + for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) + if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) + goto out; + + atomic_inc(&x86_pmu.lbr_exclusive[what]); + ret = 0; + +out: + mutex_unlock(&pmc_reserve_mutex); + return ret; +} + +void x86_del_exclusive(unsigned int what) +{ + atomic_dec(&x86_pmu.lbr_exclusive[what]); +} + int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; @@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event) /* BTS is currently only allowed for user-mode. */ if (!attr->exclude_kernel) return -EOPNOTSUPP; + + /* disallow bts if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; } hwc->config |= config; -- cgit v1.2.3 From 90413464313e00fe4975f4a0ebf25fe31d01f793 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:06:54 +0100 Subject: perf/x86: Vectorize cpuc->kfree_on_online Make the cpuc->kfree_on_online a vector to accommodate more than one entry and add the second entry to be used by a later patch. 
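For context on the exclusivity helpers added in the 'Mark Intel PT and LBR/BTS as mutually exclusive' commit above: the Intel PT driver introduced elsewhere in that series takes the opposite token in its event init path. A rough sketch, with the function and enum names treated as assumptions rather than code from this log:

static int pt_event_init(struct perf_event *event)
{
        /* Refuse PT if LBR/BTS users already exist, and vice versa. */
        if (x86_add_exclusive(x86_lbr_exclusive_pt))
                return -EBUSY;

        /* The destroy hook drops the reference again via x86_del_exclusive(). */
        event->destroy = pt_event_destroy;
        return 0;
}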
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Maria Dimakopoulou Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 549d01d6d996..682ef00727e7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1373,11 +1373,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int ret = NOTIFY_OK; + int i, ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - cpuc->kfree_on_online = NULL; + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) + cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) ret = x86_pmu.cpu_prepare(cpu); break; @@ -1388,7 +1389,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_ONLINE: - kfree(cpuc->kfree_on_online); + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { + kfree(cpuc->kfree_on_online[i]); + cpuc->kfree_on_online[i] = NULL; + } break; case CPU_DYING: -- cgit v1.2.3 From c5362c0c376486afcf3c91d3c2691d348ac1e2fd Mon Sep 17 00:00:00 2001 From: Maria Dimakopoulou Date: Mon, 17 Nov 2014 20:06:55 +0100 Subject: perf/x86: Add 3 new scheduling callbacks This patch adds 3 new PMU model specific callbacks during the event scheduling done by x86_schedule_events(). ->start_scheduling(): invoked when entering the schedule routine. ->stop_scheduling(): invoked at the end of the schedule routine ->commit_scheduling(): invoked for each committed event To be used optionally by model-specific code. Signed-off-by: Maria Dimakopoulou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 682ef00727e7..cd6115867fb8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -784,6 +784,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) bitmap_zero(used_mask, X86_PMC_IDX_MAX); + if (x86_pmu.start_scheduling) + x86_pmu.start_scheduling(cpuc); + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); @@ -830,6 +833,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) for (i = 0; i < n; i++) { e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; + if (x86_pmu.commit_scheduling) + x86_pmu.commit_scheduling(cpuc, e, assign[i]); } } /* @@ -850,6 +855,10 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.put_event_constraints(cpuc, e); } } + + if (x86_pmu.stop_scheduling) + x86_pmu.stop_scheduling(cpuc); + return num ? 
-EINVAL : 0; } -- cgit v1.2.3 From 79cba822443a168c8f7f5b853d9c7225a6d5415e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:06:56 +0100 Subject: perf/x86: Add 'index' param to get_event_constraint() callback This patch adds an index parameter to the get_event_constraint() x86_pmu callback. It is expected to represent the index of the event in the cpuc->event_list[] array. When the callback is used for fake_cpuc (event validation), then the index must be -1. The motivation for passing the index is to use it to index into another cpuc array. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: maria.n.dimakopoulou@gmail.com Link: http://lkml.kernel.org/r/1416251225-17721-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cd6115867fb8..71755401476c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -789,7 +789,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); hwc->constraint = c; wmin = min(wmin, c->weight); @@ -1777,7 +1777,7 @@ static int validate_event(struct perf_event *event) if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); - c = x86_pmu.get_event_constraints(fake_cpuc, event); + c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); if (!c || !c->weight) ret = -EINVAL; -- cgit v1.2.3 From e979121b1b1556e184492e6fc149bbe188fc83e6 Mon Sep 17 00:00:00 2001 From: Maria Dimakopoulou Date: Mon, 17 Nov 2014 20:06:58 +0100 Subject: perf/x86/intel: Implement cross-HT corruption bug workaround This patch implements a software workaround for a HW erratum on Intel SandyBridge, IvyBridge and Haswell processors with Hyperthreading enabled. The errata are documented for each processor in their respective specification update documents: - SandyBridge: BJ122 - IvyBridge: BV98 - Haswell: HSD29 The bug causes silent counter corruption across hyperthreads only when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3). Counters measuring those events may leak counts to the sibling counter. For instance, counter 0, thread 0 measuring event 0xd0, may leak to counter 0, thread 1, regardless of the event measured there. The size of the leak is not predictable. It all depends on the workload and the state of each sibling hyper-thread. The corrupting events do undercount as a consequence of the leak. The leak is compensated automatically only when the sibling counter measures the exact same corrupting event AND the workload on the two threads is the same. Given that there is no way to guarantee this, a work-around is necessary. Furthermore, there is a serious problem if the leaked count is added to a low-occurrence event. In that case the corruption on the low occurrence event can be very large, e.g., orders of magnitude. There is no HW or FW workaround for this problem. The bug is very easy to reproduce on a loaded system. Here is an example on a Haswell client, where CPU0, CPU4 are siblings. We load the CPUs with a simple triad app streaming large floating-point vectors.
We use the 0x81d0 corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and 0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not using the LBR, the 0x20cc event should be zero. $ taskset -c 0 triad & $ taskset -c 4 triad & $ perf stat -a -C 0 -e r81d0 sleep 100 & $ perf stat -a -C 4 -e r20cc sleep 10 Performance counter stats for 'system wide': 139 277 291 r20cc 10,000969126 seconds time elapsed In this example, 0x81d0 and 0x20cc are using sibling counters on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it from 0 to 139 million occurrences. This patch provides a software workaround to this problem by modifying the way events are scheduled onto counters by the kernel. The patch forces cross-thread mutual exclusion between counters in case a corrupting event is measured by one of the hyper-threads. If thread 0, counter 0 is measuring event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting event is measured on any hyper-thread, event scheduling proceeds as before. The same example run with the workaround enabled yields the correct answer: $ taskset -c 0 triad & $ taskset -c 4 triad & $ perf stat -a -C 0 -e r81d0 sleep 100 & $ perf stat -a -C 4 -e r20cc sleep 10 Performance counter stats for 'system wide': 0 r20cc 10,000969126 seconds time elapsed The patch does provide correctness for all non-corrupting events. It does not "repatriate" the leaked counts back to the leaking counter. This is planned for a second patch series. This patch series makes this repatriation easier by guaranteeing the sibling counter is not measuring any useful event. The patch introduces dynamic constraints for events. That means that events which did not have constraints, i.e., could be measured on any counters, may now be constrained to a subset of the counters depending on what is going on on the sibling thread. The algorithm is similar to a cache coherency protocol. We call it XSU in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU counter. As a consequence of the workaround, users may see an increased amount of event multiplexing, even in situations where there are fewer events than counters measured on a CPU. The patch has been tested on all three impacted processors. Note that when HT is off, there is no corruption. However, the workaround is still enabled, yet not costing too much. Adding dynamic detection of HT being on turned out to be complex, requiring too much code to be justified. This patch addresses the issue when PEBS is not used. A subsequent patch fixes the problem when PEBS is used.
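To make the XSU scheme described above concrete, here is a conceptual sketch of the per-counter state and the sharing rule. The real implementation lives in the Intel-specific scheduling code (dynamic constraints built when constraints are requested for an event); the names below are illustrative assumptions, not code from this patch:

enum xsu_state {
        XSU_UNUSED,     /* sibling thread does not use this counter */
        XSU_SHARED,     /* sibling measures a non-corrupting event on it */
        XSU_EXCLUSIVE,  /* sibling measures a corrupting (0xd0-0xd3) event */
};

/*
 * May this thread schedule an event on a counter, given the state the
 * sibling thread has published for that same counter?
 */
static bool counter_allowed(enum xsu_state sibling, bool event_is_corrupting)
{
        switch (sibling) {
        case XSU_UNUSED:
                return true;                    /* anything goes */
        case XSU_SHARED:
                return !event_is_corrupting;    /* a corrupting event here would leak into the sibling */
        case XSU_EXCLUSIVE:
        default:
                return false;                   /* the sibling's corrupting event would leak into us */
        }
}

Constraint building then masks out every counter for which this check fails, which is exactly what produces the additional event multiplexing mentioned above.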
Signed-off-by: Maria Dimakopoulou [spinlock_t -> raw_spinlock_t] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 71755401476c..b8b7a1277d8d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -779,7 +779,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct perf_event *e; - int i, wmin, wmax, num = 0; + int i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); @@ -822,14 +822,20 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) - num = perf_assign_events(cpuc->event_list, n, wmin, - wmax, assign); + unsched = perf_assign_events(cpuc->event_list, n, wmin, + wmax, assign); /* - * Mark the event as committed, so we do not put_constraint() - * in case new events are added and fail scheduling. + * In case of success (unsched = 0), mark events as committed, + * so we do not put_constraint() in case new events are added + * and fail to be scheduled + * + * We invoke the lower level commit callback to lock the resource + * + * We do not need to do all of this in case we are called to + * validate an event group (assign == NULL) */ - if (!num && assign) { + if (!unsched && assign) { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; @@ -837,11 +843,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.commit_scheduling(cpuc, e, assign[i]); } } - /* - * scheduling failed or is just a simulation, - * free resources if necessary - */ - if (!assign || num) { + + if (!assign || unsched) { + for (i = 0; i < n; i++) { e = cpuc->event_list[i]; /* @@ -851,6 +855,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) continue; + /* + * release events that failed scheduling + */ if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(cpuc, e); } @@ -859,7 +866,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (x86_pmu.stop_scheduling) x86_pmu.stop_scheduling(cpuc); - return num ? -EINVAL : 0; + return unsched ? -EINVAL : 0; } /* -- cgit v1.2.3 From da3e606d885a17525eb18afd423f5c438860b833 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Feb 2015 09:48:31 -0800 Subject: perf/x86: Dump DEBUGCTL in PMU dump LBRs and LBR freezing are controlled through the DEBUGCTL MSR. So dump the state of DEBUGCTL too when dumping the PMU state. 
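As a reading aid, and not part of the patch itself: the bits of the dumped DEBUGCTL value most relevant to the LBR discussion are the ones behind the kernel's DEBUGCTLMSR_* definitions, so a hypothetical extra decode step in the dump could look like this:

        if (debugctl & DEBUGCTLMSR_LBR)
                pr_info("CPU#%d: LBR recording currently enabled\n", cpu);
        if (debugctl & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
                pr_info("CPU#%d: LBRs freeze on PMI\n", cpu);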
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: eranian@google.com Link: http://lkml.kernel.org/r/1425059312-18217-3-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b8b7a1277d8d..994737263daa 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1171,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - u64 pebs; + u64 pebs, debugctl; struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; @@ -1197,6 +1197,10 @@ void perf_event_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + if (x86_pmu.lbr_nr) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); + } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); -- cgit v1.2.3 From 15fde1101a1aed11958e0d86bc360f01866a74b1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Feb 2015 09:48:32 -0800 Subject: perf/x86: Only dump PEBS register when PEBS has been detected Technically PEBS_ENABLED is only guaranteed to exist when we detected PEBS. So add a check for this to the PMU dump function. I don't think it can happen on a real CPU, but could in a VM. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: eranian@google.com Link: http://lkml.kernel.org/r/1425059312-18217-4-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 994737263daa..689e35760924 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1189,14 +1189,16 @@ void perf_event_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + if (x86_pmu.pebs_constraints) { + rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); + pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + } if (x86_pmu.lbr_nr) { rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); -- cgit v1.2.3