summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c7
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/intel-family.h68
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/switch_to.h38
-rw-r--r--arch/x86/include/asm/vsyscall.h1
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c36
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c21
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c44
-rw-r--r--arch/x86/kernel/tboot.c10
-rw-r--r--arch/x86/kvm/emulate.c7
-rw-r--r--arch/x86/kvm/ioapic.c20
-rw-r--r--arch/x86/kvm/vmx.c4
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--arch/x86/lib/delay.c7
-rw-r--r--arch/x86/mm/kaiser.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c13
22 files changed, 259 insertions, 34 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 67ca7e4872f0..ddc3abdf437e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,7 @@ config X86
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_WITHIN_STACK_FRAMES
+ select HAVE_EBPF_JIT if X86_64
select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 3633ad6145c5..c18806b5db2a 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -965,7 +965,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
if (sg_is_last(req->src) &&
req->src->offset + req->src->length <= PAGE_SIZE &&
- sg_is_last(req->dst) &&
+ sg_is_last(req->dst) && req->dst->length &&
req->dst->offset + req->dst->length <= PAGE_SIZE) {
one_entry_in_sg = 1;
scatterwalk_start(&src_sg_walk, req->src);
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 112178b401a1..2d359991a273 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -46,6 +46,7 @@ static enum { EMULATE, NATIVE, NONE } vsyscall_mode =
#else
EMULATE;
#endif
+unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL;
static int __init vsyscall_setup(char *str)
{
@@ -336,11 +337,11 @@ void __init map_vsyscall(void)
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
+ if (vsyscall_mode != NATIVE)
+ vsyscall_pgprot = __PAGE_KERNEL_VVAR;
if (vsyscall_mode != NONE)
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
- vsyscall_mode == NATIVE
- ? PAGE_KERNEL_VSYSCALL
- : PAGE_KERNEL_VVAR);
+ __pgprot(vsyscall_pgprot));
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0fbc98568018..641f0f2c2982 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -199,6 +199,7 @@
#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
#define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
new file mode 100644
index 000000000000..6999f7d01a0d
--- /dev/null
+++ b/arch/x86/include/asm/intel-family.h
@@ -0,0 +1,68 @@
+#ifndef _ASM_X86_INTEL_FAMILY_H
+#define _ASM_X86_INTEL_FAMILY_H
+
+/*
+ * "Big Core" Processors (Branded as Core, Xeon, etc...)
+ *
+ * The "_X" parts are generally the EP and EX Xeons, or the
+ * "Extreme" ones, like Broadwell-E.
+ *
+ * Things ending in "2" are usually because we have no better
+ * name for them. There's no processor called "WESTMERE2".
+ */
+
+#define INTEL_FAM6_CORE_YONAH 0x0E
+#define INTEL_FAM6_CORE2_MEROM 0x0F
+#define INTEL_FAM6_CORE2_MEROM_L 0x16
+#define INTEL_FAM6_CORE2_PENRYN 0x17
+#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D
+
+#define INTEL_FAM6_NEHALEM 0x1E
+#define INTEL_FAM6_NEHALEM_EP 0x1A
+#define INTEL_FAM6_NEHALEM_EX 0x2E
+#define INTEL_FAM6_WESTMERE 0x25
+#define INTEL_FAM6_WESTMERE2 0x1F
+#define INTEL_FAM6_WESTMERE_EP 0x2C
+#define INTEL_FAM6_WESTMERE_EX 0x2F
+
+#define INTEL_FAM6_SANDYBRIDGE 0x2A
+#define INTEL_FAM6_SANDYBRIDGE_X 0x2D
+#define INTEL_FAM6_IVYBRIDGE 0x3A
+#define INTEL_FAM6_IVYBRIDGE_X 0x3E
+
+#define INTEL_FAM6_HASWELL_CORE 0x3C
+#define INTEL_FAM6_HASWELL_X 0x3F
+#define INTEL_FAM6_HASWELL_ULT 0x45
+#define INTEL_FAM6_HASWELL_GT3E 0x46
+
+#define INTEL_FAM6_BROADWELL_CORE 0x3D
+#define INTEL_FAM6_BROADWELL_XEON_D 0x56
+#define INTEL_FAM6_BROADWELL_GT3E 0x47
+#define INTEL_FAM6_BROADWELL_X 0x4F
+
+#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E
+#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E
+#define INTEL_FAM6_SKYLAKE_X 0x55
+#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E
+#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E
+
+/* "Small Core" Processors (Atom) */
+
+#define INTEL_FAM6_ATOM_PINEVIEW 0x1C
+#define INTEL_FAM6_ATOM_LINCROFT 0x26
+#define INTEL_FAM6_ATOM_PENWELL 0x27
+#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35
+#define INTEL_FAM6_ATOM_CEDARVIEW 0x36
+#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */
+#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
+#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
+#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */
+#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */
+#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
+#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
+
+/* Xeon Phi */
+
+#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
+
+#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9d2abb2a41d2..74fda1a453bd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -998,7 +998,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
static inline int emulate_instruction(struct kvm_vcpu *vcpu,
int emulation_type)
{
- return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+ return x86_emulate_instruction(vcpu, 0,
+ emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0);
}
void kvm_enable_efer_bits(u64);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c124d6ab4bf9..86bccb4bd4dc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -574,7 +574,7 @@ static inline void sync_core(void)
{
int tmp;
-#ifdef CONFIG_M486
+#ifdef CONFIG_X86_32
/*
* Do a CPUID if available, otherwise do a jump. The jump
* can conveniently enough be the jump around CPUID.
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 751bf4b7bf11..025ecfaba9c9 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_SWITCH_TO_H
#define _ASM_X86_SWITCH_TO_H
+#include <asm/nospec-branch.h>
+
struct task_struct; /* one of the stranger aspects of C forward declarations */
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
@@ -24,6 +26,23 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
#define __switch_canary_iparam
#endif /* CC_STACKPROTECTOR */
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+#define __retpoline_fill_return_buffer \
+ ALTERNATIVE("jmp 910f", \
+ __stringify(__FILL_RETURN_BUFFER(%%ebx, RSB_CLEAR_LOOPS, %%esp)),\
+ X86_FEATURE_RSB_CTXSW) \
+ "910:\n\t"
+#else
+#define __retpoline_fill_return_buffer
+#endif
+
/*
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
@@ -46,6 +65,7 @@ do { \
"movl $1f,%[prev_ip]\n\t" /* save EIP */ \
"pushl %[next_ip]\n\t" /* restore EIP */ \
__switch_canary \
+ __retpoline_fill_return_buffer \
"jmp __switch_to\n" /* regparm call */ \
"1:\t" \
"popl %%ebp\n\t" /* restore EBP */ \
@@ -100,6 +120,23 @@ do { \
#define __switch_canary_iparam
#endif /* CC_STACKPROTECTOR */
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+#define __retpoline_fill_return_buffer \
+ ALTERNATIVE("jmp 910f", \
+ __stringify(__FILL_RETURN_BUFFER(%%r12, RSB_CLEAR_LOOPS, %%rsp)),\
+ X86_FEATURE_RSB_CTXSW) \
+ "910:\n\t"
+#else
+#define __retpoline_fill_return_buffer
+#endif
+
/*
* There is no need to save or restore flags, because flags are always
* clean in kernel mode, with the possible exception of IOPL. Kernel IOPL
@@ -112,6 +149,7 @@ do { \
"call __switch_to\n\t" \
"movq "__percpu_arg([current_task])",%%rsi\n\t" \
__switch_canary \
+ __retpoline_fill_return_buffer \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
"movq %%rax,%%rdi\n\t" \
"testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index 4865e10dbb55..9ee85066f407 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -13,6 +13,7 @@ extern void map_vsyscall(void);
*/
extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
extern bool vsyscall_enabled(void);
+extern unsigned long vsyscall_pgprot;
#else
static inline void map_vsyscall(void) {}
static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fc91c98bee01..fd945099fc95 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2592,8 +2592,8 @@ static struct resource * __init ioapic_setup_resources(void)
res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
+ ioapics[i].iomem_res = &res[num];
num++;
- ioapics[i].iomem_res = res;
}
ioapic_resources = res;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 49d25ddf0e9f..8cacf62ec458 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -22,6 +22,7 @@
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
+#include <asm/intel-family.h>
static void __init spectre_v2_select_mitigation(void);
@@ -154,6 +155,23 @@ disable:
return SPECTRE_V2_CMD_NONE;
}
+/* Check for Skylake-like CPUs (for RSB handling) */
+static bool __init is_skylake_era(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6) {
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ return true;
+ }
+ }
+ return false;
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -212,6 +230,24 @@ retpoline_auto:
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
+
+ /*
+ * If neither SMEP or KPTI are available, there is a risk of
+ * hitting userspace addresses in the RSB after a context switch
+ * from a shallow call stack to a deeper one. To prevent this fill
+ * the entire RSB, even when using IBRS.
+ *
+ * Skylake era CPUs have a separate issue with *underflow* of the
+ * RSB, when they will predict 'ret' targets from the generic BTB.
+ * The proper mitigation for this is IBRS. If IBRS is not supported
+ * or deactivated in favour of retpolines the RSB fill on context
+ * switch is required.
+ */
+ if ((!boot_cpu_has(X86_FEATURE_KAISER) &&
+ !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ pr_info("Filling RSB on context switch\n");
+ }
}
#undef pr_fmt
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index e38d338a6447..b4ca91cf55b0 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu)
ci_leaf_init(this_leaf++, &id4_regs);
__cache_cpumap_setup(cpu, idx, &id4_regs);
}
+ this_cpu_ci->cpu_map_populated = true;
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index b428a8174be1..2c76a1801393 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,6 +39,9 @@
#include <asm/setup.h>
#include <asm/msr.h>
+/* last level cache size per core */
+static int llc_size_per_core;
+
static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
static struct mc_saved_data {
unsigned int mc_saved_count;
@@ -996,15 +999,18 @@ static bool is_blacklisted(unsigned int cpu)
/*
* Late loading on model 79 with microcode revision less than 0x0b000021
- * may result in a system hang. This behavior is documented in item
- * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family).
+ * and LLC size per core bigger than 2.5MB may result in a system hang.
+ * This behavior is documented in item BDF90, #334165 (Intel Xeon
+ * Processor E7-8800/4800 v4 Product Family).
*/
if (c->x86 == 6 &&
c->x86_model == 79 &&
c->x86_mask == 0x01 &&
+ llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+ return true;
}
return false;
@@ -1067,6 +1073,15 @@ static struct microcode_ops microcode_intel_ops = {
.microcode_fini_cpu = microcode_fini_cpu,
};
+static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
+{
+ u64 llc_size = c->x86_cache_size * 1024;
+
+ do_div(llc_size, c->x86_max_cores);
+
+ return (int)llc_size;
+}
+
struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -1077,6 +1092,8 @@ struct microcode_ops * __init init_intel_microcode(void)
return NULL;
}
+ llc_size_per_core = calc_llc_size_per_core(c);
+
return &microcode_intel_ops;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index 2cad71d1b14c..5af11c46d0b9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -22,6 +22,7 @@
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>
+#include <linux/kaiser.h>
#include <asm-generic/sizes.h>
#include <asm/perf_event.h>
@@ -67,6 +68,23 @@ static size_t buf_size(struct page *page)
return 1 << (PAGE_SHIFT + page_private(page));
}
+static void bts_buffer_free_aux(void *data)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ struct bts_buffer *buf = data;
+ int nbuf;
+
+ for (nbuf = 0; nbuf < buf->nr_bufs; nbuf++) {
+ struct page *page = buf->buf[nbuf].page;
+ void *kaddr = page_address(page);
+ size_t page_size = buf_size(page);
+
+ kaiser_remove_mapping((unsigned long)kaddr, page_size);
+ }
+#endif
+ kfree(data);
+}
+
static void *
bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
{
@@ -103,29 +121,33 @@ bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
buf->real_size = size - size % BTS_RECORD_SIZE;
for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
- unsigned int __nr_pages;
+ void *kaddr = pages[pg];
+ size_t page_size;
+
+ page = virt_to_page(kaddr);
+ page_size = buf_size(page);
+
+ if (kaiser_add_mapping((unsigned long)kaddr,
+ page_size, __PAGE_KERNEL) < 0) {
+ buf->nr_bufs = nbuf;
+ bts_buffer_free_aux(buf);
+ return NULL;
+ }
- page = virt_to_page(pages[pg]);
- __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
buf->buf[nbuf].page = page;
buf->buf[nbuf].offset = offset;
buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
- buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+ buf->buf[nbuf].size = page_size - buf->buf[nbuf].displacement;
pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
buf->buf[nbuf].size -= pad;
- pg += __nr_pages;
- offset += __nr_pages << PAGE_SHIFT;
+ pg += page_size >> PAGE_SHIFT;
+ offset += page_size;
}
return buf;
}
-static void bts_buffer_free_aux(void *data)
-{
- kfree(data);
-}
-
static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
{
return buf->buf[idx].offset + buf->buf[idx].displacement;
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 91a4496db434..c77ab1f51fbe 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -140,6 +140,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
pte_unmap(pte);
+
+ /*
+ * PTI poisons low addresses in the kernel page tables in the
+ * name of making them unusable for userspace. To execute
+ * code at such a low address, the poison must be cleared.
+ *
+ * Note: 'pgd' actually gets set in pud_alloc().
+ */
+ pgd->pgd &= ~_PAGE_NX;
+
return 0;
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 00045499f6c2..e4eb1d2bf849 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4978,6 +4978,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
bool op_prefix = false;
bool has_seg_override = false;
struct opcode opcode;
+ u16 dummy;
+ struct desc_struct desc;
ctxt->memop.type = OP_NONE;
ctxt->memopp = NULL;
@@ -4996,6 +4998,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
switch (mode) {
case X86EMUL_MODE_REAL:
case X86EMUL_MODE_VM86:
+ def_op_bytes = def_ad_bytes = 2;
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS);
+ if (desc.d)
+ def_op_bytes = def_ad_bytes = 4;
+ break;
case X86EMUL_MODE_PROT16:
def_op_bytes = def_ad_bytes = 2;
break;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 3aab53f8cad2..d380111351c0 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -247,8 +247,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
index == RTC_GSI) {
if (kvm_apic_match_dest(vcpu, NULL, 0,
e->fields.dest_id, e->fields.dest_mode) ||
- (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
- kvm_apic_pending_eoi(vcpu, e->fields.vector)))
+ kvm_apic_pending_eoi(vcpu, e->fields.vector))
__set_bit(e->fields.vector,
(unsigned long *)eoi_exit_bitmap);
}
@@ -269,6 +268,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
bool mask_before, mask_after;
+ int old_remote_irr, old_delivery_status;
union kvm_ioapic_redirect_entry *e;
switch (ioapic->ioregsel) {
@@ -291,14 +291,28 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
return;
e = &ioapic->redirtbl[index];
mask_before = e->fields.mask;
+ /* Preserve read-only fields */
+ old_remote_irr = e->fields.remote_irr;
+ old_delivery_status = e->fields.delivery_status;
if (ioapic->ioregsel & 1) {
e->bits &= 0xffffffff;
e->bits |= (u64) val << 32;
} else {
e->bits &= ~0xffffffffULL;
e->bits |= (u32) val;
- e->fields.remote_irr = 0;
}
+ e->fields.remote_irr = old_remote_irr;
+ e->fields.delivery_status = old_delivery_status;
+
+ /*
+ * Some OSes (Linux, Xen) assume that Remote IRR bit will
+ * be cleared by IOAPIC hardware when the entry is configured
+ * as edge-triggered. This behavior is used to simulate an
+ * explicit EOI on IOAPICs that don't have the EOI register.
+ */
+ if (e->fields.trig_mode == IOAPIC_EDGE_TRIG)
+ e->fields.remote_irr = 0;
+
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 75d60e40c389..f8d785aa2e96 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4954,7 +4954,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
}
- vmcs_writel(GUEST_RFLAGS, 0x02);
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0xfff0);
vmcs_writel(GUEST_GDTR_BASE, 0);
@@ -6023,7 +6023,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
return 1;
- err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
+ err = emulate_instruction(vcpu, 0);
if (err == EMULATE_USER_EXIT) {
++vcpu->stat.mmio_exits;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f973cfa8ff4f..3900d34980de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5153,7 +5153,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
- r = EMULATE_FAIL;
+ r = EMULATE_USER_EXIT;
}
kvm_queue_exception(vcpu, UD_VECTOR);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index e912b2f6d36e..45772560aceb 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops)
{
u64 start, end, delay, loops = __loops;
+ /*
+ * Timer value of 0 causes MWAITX to wait indefinitely, unless there
+ * is a store on the memory monitored by MONITORX.
+ */
+ if (loops == 0)
+ return;
+
start = rdtsc_ordered();
for (;;) {
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 8af98513d36c..2298434f7bdb 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -345,7 +345,7 @@ void __init kaiser_init(void)
if (vsyscall_enabled())
kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
PAGE_SIZE,
- __PAGE_KERNEL_VSYSCALL);
+ vsyscall_pgprot);
for_each_possible_cpu(cpu) {
void *percpu_vaddr = __per_cpu_user_mapped_start +
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 75991979f667..33c42b826791 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -266,10 +266,10 @@ static void emit_bpf_tail_call(u8 **pprog)
/* if (index >= array->map.max_entries)
* goto out;
*/
- EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */
+ EMIT2(0x89, 0xD2); /* mov edx, edx */
+ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */
offsetof(struct bpf_array, map.max_entries));
- EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */
-#define OFFSET1 47 /* number of bytes to jump */
+#define OFFSET1 43 /* number of bytes to jump */
EMIT2(X86_JBE, OFFSET1); /* jbe out */
label1 = cnt;
@@ -278,21 +278,20 @@ static void emit_bpf_tail_call(u8 **pprog)
*/
EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
-#define OFFSET2 36
+#define OFFSET2 32
EMIT2(X86_JA, OFFSET2); /* ja out */
label2 = cnt;
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
/* prog = array->ptrs[index]; */
- EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
+ EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
offsetof(struct bpf_array, ptrs));
- EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */
/* if (prog == NULL)
* goto out;
*/
- EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */
+ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
#define OFFSET3 10
EMIT2(X86_JE, OFFSET3); /* je out */
label3 = cnt;