From cc3a1bf52a9d2808c7cd6e8f413b02b650b6b84b Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 25 Oct 2011 19:51:59 +0530
Subject: x86: Clean up and extend do_int3()

Since there is a possibility of !KPROBES int3 listeners
(such as kgdb) and since DIE_TRAP is currently not being
used by anybody, notify all listeners with DIE_INT3.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20111025142159.GB21225@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..fa1191fb679d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 			== NOTIFY_STOP)
 		return;
 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
-#ifdef CONFIG_KPROBES
+
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
 			== NOTIFY_STOP)
 		return;
-#else
-	if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
-			== NOTIFY_STOP)
-		return;
-#endif
 
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
-- 
cgit v1.2.3


From 228bdaa95fb830e08b6acd1afd4d2c55093cabfa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 9 Dec 2011 03:02:19 -0500
Subject: x86: Keep current stack in NMI breakpoints

We want to allow NMI handlers to have breakpoints to be able to
remove stop_machine from ftrace, kprobes and jump_labels. But if
an NMI interrupts a current breakpoint, and then it triggers a
breakpoint itself, it will switch to the breakpoint stack and
corrupt the data on it for the breakpoint processing that it
interrupted.

Instead, have the NMI check if it interrupted breakpoint processing
by checking if the stack that is currently used is a breakpoint
stack. If it is, then load a special IDT that changes the IST
for the debug exception to keep the same stack in kernel context.
When the NMI is done, it puts it back.

This way, if the NMI does trigger a breakpoint, it will keep
using the same stack and not stomp on the breakpoint data for
the breakpoint it interrupted.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/traps.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..a93c5cabc36a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -723,4 +723,10 @@ void __init trap_init(void)
 	cpu_init();
 
 	x86_init.irqs.trap_init();
+
+#ifdef CONFIG_X86_64
+	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
+	set_nmi_gate(1, &debug);
+	set_nmi_gate(3, &int3);
+#endif
 }
-- 
cgit v1.2.3


From 42181186ad4db986fcaa40ca95c6e407e9e79372 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 16 Dec 2011 11:43:02 -0500
Subject: x86: Add counter when debug stack is used with interrupts enabled

Mathieu Desnoyers pointed out a case that can cause issues with
NMIs running on the debug stack:

  int3 -> interrupt -> NMI -> int3

Because the interrupt changes the stack, the NMI will not see that
it preempted the debug stack. Looking deeper at this case,
interrupts only happen when the int3 is from userspace or in
an a location in the exception table (fixup).

  userspace -> int3 -> interurpt -> NMI -> int3

All other int3s that happen in the kernel should be processed
without ever enabling interrupts, as the do_trap() call will
panic the kernel if it is called to process any other location
within the kernel.

Adding a counter around the sections that enable interrupts while
using the debug stack allows the NMI to also check that case.
If the NMI sees that it either interrupted a task using the debug
stack or the debug counter is non-zero, then it will have to
change the IDT table to make the int3 not change stacks (which will
corrupt the stack if it does).

Note, I had to move the debug_usage functions out of processor.h
and into debugreg.h because of the static inlined functions to
inc and dec the debug_usage counter. __get_cpu_var() requires
smp.h which includes processor.h, and would fail to build.

Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/traps.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a93c5cabc36a..0072b38e3ea1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 }
 
 #ifdef CONFIG_X86_64
@@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 							SIGTRAP) == NOTIFY_STOP)
 		return;
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
+
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
@@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 		handle_vm86_trap((struct kernel_vm86_regs *) regs,
 				error_code, 1);
 		preempt_conditional_cli(regs);
+		debug_stack_usage_dec();
 		return;
 	}
 
@@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 
 	return;
 }
-- 
cgit v1.2.3


From be98c2cdb15ba26148cd2bd58a857d4f7759ed38 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 13 Feb 2012 13:47:25 -0800
Subject: i387: math_state_restore() isn't called from asm

It was marked asmlinkage for some really old and stale legacy reasons.
Fix that and the equally stale comment.

Noticed when debugging the irq_fpu_usable() bugs.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 482ec3af2067..982433b5da30 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -599,10 +599,10 @@ void __math_state_restore(void)
  * Careful.. There are problems with IBM-designed IRQ13 behaviour.
  * Don't touch unless you *really* know how it works.
  *
- * Must be called with kernel preemption disabled (in this case,
- * local interrupts are disabled at the call-site in entry.S).
+ * Must be called with kernel preemption disabled (eg with local
+ * local interrupts as in the case of do_device_not_available).
  */
-asmlinkage void math_state_restore(void)
+void math_state_restore(void)
 {
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = thread->task;
-- 
cgit v1.2.3


From 5b1cbac37798805c1fee18c8cebe5c0a13975b17 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 13 Feb 2012 13:56:14 -0800
Subject: i387: make irq_fpu_usable() tests more robust

Some code - especially the crypto layer - wants to use the x86
FP/MMX/AVX register set in what may be interrupt (typically softirq)
context.

That *can* be ok, but the tests for when it was ok were somewhat
suspect.  We cannot touch the thread-specific status bits either, so
we'd better check that we're not going to try to save FP state or
anything like that.

Now, it may be that the TS bit is always cleared *before* we set the
USEDFPU bit (and only set when we had already cleared the USEDFP
before), so the TS bit test may actually have been sufficient, but it
certainly was not obviously so.

So this explicitly verifies that we will not touch the TS_USEDFPU bit,
and adds a few related sanity-checks.  Because it seems that somehow
AES-NI is corrupting user FP state.  The cause is not clear, and this
patch doesn't fix it, but while debugging it I really wanted the code to
be more obviously correct and robust.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 982433b5da30..8ba27dbc107a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -631,6 +631,7 @@ EXPORT_SYMBOL_GPL(math_state_restore);
 dotraplinkage void __kprobes
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
+	WARN_ON_ONCE(!user_mode_vm(regs));
 #ifdef CONFIG_MATH_EMULATION
 	if (read_cr0() & X86_CR0_EM) {
 		struct math_emu_info info = { };
-- 
cgit v1.2.3


From 15d8791cae75dca27bfda8ecfe87dca9379d6bb0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 09:15:04 -0800
Subject: i387: fix x86-64 preemption-unsafe user stack save/restore

Commit 5b1cbac37798 ("i387: make irq_fpu_usable() tests more robust")
added a sanity check to the #NM handler to verify that we never cause
the "Device Not Available" exception in kernel mode.

However, that check actually pinpointed a (fundamental) race where we do
cause that exception as part of the signal stack FPU state save/restore
code.

Because we use the floating point instructions themselves to save and
restore state directly from user mode, we cannot do that atomically with
testing the TS_USEDFPU bit: the user mode access itself may cause a page
fault, which causes a task switch, which saves and restores the FP/MMX
state from the kernel buffers.

This kind of "recursive" FP state save is fine per se, but it means that
when the signal stack save/restore gets restarted, it will now take the
'#NM' exception we originally tried to avoid.  With preemption this can
happen even without the page fault - but because of the user access, we
cannot just disable preemption around the save/restore instruction.

There are various ways to solve this, including using the
"enable/disable_page_fault()" helpers to not allow page faults at all
during the sequence, and fall back to copying things by hand without the
use of the native FP state save/restore instructions.

However, the simplest thing to do is to just allow the #NM from kernel
space, but fix the race in setting and clearing CR0.TS that this all
exposed: the TS bit changes and the TS_USEDFPU bit absolutely have to be
atomic wrt scheduling, so while the actual state save/restore can be
interrupted and restarted, the act of actually clearing/setting CR0.TS
and the TS_USEDFPU bit together must not.

Instead of just adding random "preempt_disable/enable()" calls to what
is already excessively ugly code, this introduces some helper functions
that mostly mirror the "kernel_fpu_begin/end()" functionality, just for
the user state instead.

Those helper functions should probably eventually replace the other
ad-hoc CR0.TS and TS_USEDFPU tests too, but I'll need to think about it
some more: the task switching functionality in particular needs to
expose the difference between the 'prev' and 'next' threads, while the
new helper functions intentionally were written to only work with
'current'.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8ba27dbc107a..982433b5da30 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -631,7 +631,6 @@ EXPORT_SYMBOL_GPL(math_state_restore);
 dotraplinkage void __kprobes
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-	WARN_ON_ONCE(!user_mode_vm(regs));
 #ifdef CONFIG_MATH_EMULATION
 	if (read_cr0() & X86_CR0_EM) {
 		struct math_emu_info info = { };
-- 
cgit v1.2.3


From 6d59d7a9f5b723a7ac1925c136e93ec83c0c3043 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 13:33:12 -0800
Subject: i387: don't ever touch TS_USEDFPU directly, use helper functions

This creates three helper functions that do the TS_USEDFPU accesses, and
makes everybody that used to do it by hand use those helpers instead.

In addition, there's a couple of helper functions for the "change both
CR0.TS and TS_USEDFPU at the same time" case, and the places that do
that together have been changed to use those.  That means that we have
fewer random places that open-code this situation.

The intent is partly to clarify the code without actually changing any
semantics yet (since we clearly still have some hard to reproduce bug in
this area), but also to make it much easier to use another approach
entirely to caching the CR0.TS bit for software accesses.

Right now we use a bit in the thread-info 'status' variable (this patch
does not change that), but we might want to make it a full field of its
own or even make it a per-cpu variable.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 982433b5da30..fc676e44c77f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -588,7 +588,7 @@ void __math_state_restore(void)
 		return;
 	}
 
-	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
+	__thread_set_has_fpu(thread);	/* clts in caller! */
 	tsk->fpu_counter++;
 }
 
-- 
cgit v1.2.3


From b3b0870ef3ffed72b92415423da864f440f57ad6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 15:45:23 -0800
Subject: i387: do not preload FPU state at task switch time

Yes, taking the trap to re-load the FPU/MMX state is expensive, but so
is spending several days looking for a bug in the state save/restore
code.  And the preload code has some rather subtle interactions with
both paravirtualization support and segment state restore, so it's not
nearly as simple as it should be.

Also, now that we no longer necessarily depend on a single bit (ie
TS_USEDFPU) for keeping track of the state of the FPU, we migth be able
to do better.  If we are really switching between two processes that
keep touching the FP state, save/restore is inevitable, but in the case
of having one process that does most of the FPU usage, we may actually
be able to do much better than the preloading.

In particular, we may be able to keep track of which CPU the process ran
on last, and also per CPU keep track of which process' FP state that CPU
has.  For modern CPU's that don't destroy the FPU contents on save time,
that would allow us to do a lazy restore by just re-enabling the
existing FPU state - with no restore cost at all!

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 35 +++++++++++------------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index fc676e44c77f..5afe824c66e5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,28 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
 
-/*
- * __math_state_restore assumes that cr0.TS is already clear and the
- * fpu state is all ready for use.  Used during context switch.
- */
-void __math_state_restore(void)
-{
-	struct thread_info *thread = current_thread_info();
-	struct task_struct *tsk = thread->task;
-
-	/*
-	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
-	 */
-	if (unlikely(restore_fpu_checking(tsk))) {
-		stts();
-		force_sig(SIGSEGV, tsk);
-		return;
-	}
-
-	__thread_set_has_fpu(thread);	/* clts in caller! */
-	tsk->fpu_counter++;
-}
-
 /*
  * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
@@ -622,9 +600,18 @@ void math_state_restore(void)
 		local_irq_disable();
 	}
 
-	clts();				/* Allow maths ops (or we recurse) */
+	__thread_fpu_begin(thread);
 
-	__math_state_restore();
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		__thread_fpu_end(thread);
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
+
+	tsk->fpu_counter++;
 }
 EXPORT_SYMBOL_GPL(math_state_restore);
 
-- 
cgit v1.2.3


From 4903062b5485f0e2c286a23b44c9b59d9b017d53 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 19:11:15 -0800
Subject: i387: move AMD K7/K8 fpu fxsave/fxrstor workaround from save to
 restore

The AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
pending.  In order to not leak FIP state from one process to another, we
need to do a floating point load after the fxsave of the old process,
and before the fxrstor of the new FPU state.  That resets the state to
the (uninteresting) kernel load, rather than some potentially sensitive
user information.

We used to do this directly after the FPU state save, but that is
actually very inconvenient, since it

 (a) corrupts what is potentially perfectly good FPU state that we might
     want to lazy avoid restoring later and

 (b) on x86-64 it resulted in a very annoying ordering constraint, where
     "__unlazy_fpu()" in the task switch needs to be delayed until after
     the DS segment has been reloaded just to get the new DS value.

Coupling it to the fxrstor instead of the fxsave automatically avoids
both of these issues, and also ensures that we only do it when actually
necessary (the FP state after a save may never actually get used).  It's
simply a much more natural place for the leaked state cleanup.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5afe824c66e5..4d42300dcd2c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -585,6 +585,10 @@ void math_state_restore(void)
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = thread->task;
 
+	/* We need a safe address that is cheap to find and that is already
+	   in L1. We just brought in "thread->task", so use that */
+#define safe_address (thread->task)
+
 	if (!tsk_used_math(tsk)) {
 		local_irq_enable();
 		/*
@@ -602,6 +606,16 @@ void math_state_restore(void)
 
 	__thread_fpu_begin(thread);
 
+	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+	   is pending.  Clear the x87 state here by setting it to fixed
+	   values. safe_address is a random variable that should be in L1 */
+	alternative_input(
+		ASM_NOP8 ASM_NOP2,
+		"emms\n\t"	  	/* clear stack tags */
+		"fildl %P[addr]",	/* set F?P to defined value */
+		X86_FEATURE_FXSAVE_LEAK,
+		[addr] "m" (safe_address));
+
 	/*
 	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
 	 */
-- 
cgit v1.2.3


From f94edacf998516ac9d849f7bc6949a703977a7f3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 17 Feb 2012 21:48:54 -0800
Subject: i387: move TS_USEDFPU flag from thread_info to task_struct

This moves the bit that indicates whether a thread has ownership of the
FPU from the TS_USEDFPU bit in thread_info->status to a word of its own
(called 'has_fpu') in task_struct->thread.has_fpu.

This fixes two independent bugs at the same time:

 - changing 'thread_info->status' from the scheduler causes nasty
   problems for the other users of that variable, since it is defined to
   be thread-synchronous (that's what the "TS_" part of the naming was
   supposed to indicate).

   So perfectly valid code could (and did) do

	ti->status |= TS_RESTORE_SIGMASK;

   and the compiler was free to do that as separate load, or and store
   instructions.  Which can cause problems with preemption, since a task
   switch could happen in between, and change the TS_USEDFPU bit. The
   change to TS_USEDFPU would be overwritten by the final store.

   In practice, this seldom happened, though, because the 'status' field
   was seldom used more than once, so gcc would generally tend to
   generate code that used a read-modify-write instruction and thus
   happened to avoid this problem - RMW instructions are naturally low
   fat and preemption-safe.

 - On x86-32, the current_thread_info() pointer would, during interrupts
   and softirqs, point to a *copy* of the real thread_info, because
   x86-32 uses %esp to calculate the thread_info address, and thus the
   separate irq (and softirq) stacks would cause these kinds of odd
   thread_info copy aliases.

   This is normally not a problem, since interrupts aren't supposed to
   look at thread information anyway (what thread is running at
   interrupt time really isn't very well-defined), but it confused the
   heck out of irq_fpu_usable() and the code that tried to squirrel
   away the FPU state.

   (It also caused untold confusion for us poor kernel developers).

It also turns out that using 'task_struct' is actually much more natural
for most of the call sites that care about the FPU state, since they
tend to work with the task struct for other reasons anyway (ie
scheduling).  And the FPU data that we are going to save/restore is
found there too.

Thanks to Arjan Van De Ven <arjan@linux.intel.com> for pointing us to
the %esp issue.

Cc: Arjan van de Ven <arjan@linux.intel.com>
Reported-and-tested-by: Raphael Prevost <raphael@buro.asia>
Acked-and-tested-by: Suresh Siddha <suresh.b.siddha@intel.com>
Tested-by: Peter Anvin <hpa@zytor.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4d42300dcd2c..ad25e51f40c4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -582,12 +582,11 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
  */
 void math_state_restore(void)
 {
-	struct thread_info *thread = current_thread_info();
-	struct task_struct *tsk = thread->task;
+	struct task_struct *tsk = current;
 
 	/* We need a safe address that is cheap to find and that is already
-	   in L1. We just brought in "thread->task", so use that */
-#define safe_address (thread->task)
+	   in L1. We're just bringing in "tsk->thread.has_fpu", so use that */
+#define safe_address (tsk->thread.has_fpu)
 
 	if (!tsk_used_math(tsk)) {
 		local_irq_enable();
@@ -604,7 +603,7 @@ void math_state_restore(void)
 		local_irq_disable();
 	}
 
-	__thread_fpu_begin(thread);
+	__thread_fpu_begin(tsk);
 
 	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
 	   is pending.  Clear the x87 state here by setting it to fixed
@@ -620,7 +619,7 @@ void math_state_restore(void)
 	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
 	 */
 	if (unlikely(restore_fpu_checking(tsk))) {
-		__thread_fpu_end(thread);
+		__thread_fpu_end(tsk);
 		force_sig(SIGSEGV, tsk);
 		return;
 	}
-- 
cgit v1.2.3


From 34ddc81a230b15c0e345b6b253049db731499f7e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 18 Feb 2012 12:56:35 -0800
Subject: i387: re-introduce FPU state preloading at context switch time

After all the FPU state cleanups and finally finding the problem that
caused all our FPU save/restore problems, this re-introduces the
preloading of FPU state that was removed in commit b3b0870ef3ff ("i387:
do not preload FPU state at task switch time").

However, instead of simply reverting the removal, this reimplements
preloading with several fixes, most notably

 - properly abstracted as a true FPU state switch, rather than as
   open-coded save and restore with various hacks.

   In particular, implementing it as a proper FPU state switch allows us
   to optimize the CR0.TS flag accesses: there is no reason to set the
   TS bit only to then almost immediately clear it again.  CR0 accesses
   are quite slow and expensive, don't flip the bit back and forth for
   no good reason.

 - Make sure that the same model works for both x86-32 and x86-64, so
   that there are no gratuitous differences between the two due to the
   way they save and restore segment state differently due to
   architectural differences that really don't matter to the FPU state.

 - Avoid exposing the "preload" state to the context switch routines,
   and in particular allow the concept of lazy state restore: if nothing
   else has used the FPU in the meantime, and the process is still on
   the same CPU, we can avoid restoring state from memory entirely, just
   re-expose the state that is still in the FPU unit.

   That optimized lazy restore isn't actually implemented here, but the
   infrastructure is set up for it.  Of course, older CPU's that use
   'fnsave' to save the state cannot take advantage of this, since the
   state saving also trashes the state.

In other words, there is now an actual _design_ to the FPU state saving,
rather than just random historical baggage.  Hopefully it's easier to
follow as a result.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 55 ++++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ad25e51f40c4..77da5b475ad2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,6 +570,37 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
 
+/*
+ * This gets called with the process already owning the
+ * FPU state, and with CR0.TS cleared. It just needs to
+ * restore the FPU register state.
+ */
+void __math_state_restore(struct task_struct *tsk)
+{
+	/* We need a safe address that is cheap to find and that is already
+	   in L1. We've just brought in "tsk->thread.has_fpu", so use that */
+#define safe_address (tsk->thread.has_fpu)
+
+	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+	   is pending.  Clear the x87 state here by setting it to fixed
+	   values. safe_address is a random variable that should be in L1 */
+	alternative_input(
+		ASM_NOP8 ASM_NOP2,
+		"emms\n\t"	  	/* clear stack tags */
+		"fildl %P[addr]",	/* set F?P to defined value */
+		X86_FEATURE_FXSAVE_LEAK,
+		[addr] "m" (safe_address));
+
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		__thread_fpu_end(tsk);
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
+}
+
 /*
  * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
@@ -584,10 +615,6 @@ void math_state_restore(void)
 {
 	struct task_struct *tsk = current;
 
-	/* We need a safe address that is cheap to find and that is already
-	   in L1. We're just bringing in "tsk->thread.has_fpu", so use that */
-#define safe_address (tsk->thread.has_fpu)
-
 	if (!tsk_used_math(tsk)) {
 		local_irq_enable();
 		/*
@@ -604,25 +631,7 @@ void math_state_restore(void)
 	}
 
 	__thread_fpu_begin(tsk);
-
-	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-	   is pending.  Clear the x87 state here by setting it to fixed
-	   values. safe_address is a random variable that should be in L1 */
-	alternative_input(
-		ASM_NOP8 ASM_NOP2,
-		"emms\n\t"	  	/* clear stack tags */
-		"fildl %P[addr]",	/* set F?P to defined value */
-		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (safe_address));
-
-	/*
-	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
-	 */
-	if (unlikely(restore_fpu_checking(tsk))) {
-		__thread_fpu_end(tsk);
-		force_sig(SIGSEGV, tsk);
-		return;
-	}
+	__math_state_restore(tsk);
 
 	tsk->fpu_counter++;
 }
-- 
cgit v1.2.3


From 80ab6f1e8c981b1b6604b2f22e36c917526235cd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 19 Feb 2012 11:48:44 -0800
Subject: i387: use 'restore_fpu_checking()' directly in task switching code

This inlines what is usually just a couple of instructions, but more
importantly it also fixes the theoretical error case (can that FPU
restore really ever fail? Maybe we should remove the checking).

We can't start sending signals from within the scheduler, we're much too
deep in the kernel and are holding the runqueue lock etc.  So don't
bother even trying.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps.c | 40 ++++++++--------------------------------
 1 file changed, 8 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel/traps.c')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 77da5b475ad2..4bbe04d96744 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,37 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
 
-/*
- * This gets called with the process already owning the
- * FPU state, and with CR0.TS cleared. It just needs to
- * restore the FPU register state.
- */
-void __math_state_restore(struct task_struct *tsk)
-{
-	/* We need a safe address that is cheap to find and that is already
-	   in L1. We've just brought in "tsk->thread.has_fpu", so use that */
-#define safe_address (tsk->thread.has_fpu)
-
-	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-	   is pending.  Clear the x87 state here by setting it to fixed
-	   values. safe_address is a random variable that should be in L1 */
-	alternative_input(
-		ASM_NOP8 ASM_NOP2,
-		"emms\n\t"	  	/* clear stack tags */
-		"fildl %P[addr]",	/* set F?P to defined value */
-		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (safe_address));
-
-	/*
-	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
-	 */
-	if (unlikely(restore_fpu_checking(tsk))) {
-		__thread_fpu_end(tsk);
-		force_sig(SIGSEGV, tsk);
-		return;
-	}
-}
-
 /*
  * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
@@ -631,7 +600,14 @@ void math_state_restore(void)
 	}
 
 	__thread_fpu_begin(tsk);
-	__math_state_restore(tsk);
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		__thread_fpu_end(tsk);
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
 
 	tsk->fpu_counter++;
 }
-- 
cgit v1.2.3