diff options
Diffstat (limited to 'arch/x86')
140 files changed, 1242 insertions, 612 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3d5d6f51d74c..38a0643783bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,7 +19,6 @@ config X86 def_bool y select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ANON_INODES select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE @@ -92,7 +91,6 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_WITHIN_STACK_FRAMES - select HAVE_EBPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 3cb8e179f2f2..056717be912f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -209,7 +209,7 @@ config HAVE_MMIOTRACE_SUPPORT config X86_DECODER_SELFTEST bool "x86 instruction decoder selftest" - depends on DEBUG_KERNEL && KPROBES + depends on DEBUG_KERNEL && INSTRUCTION_DECODER depends on !COMPILE_TEST ---help--- Perform x86 instruction decoder selftests at build time. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 0a3081d64855..e94f8ed5eea5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -34,12 +34,13 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse + -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) +REALMODE_CFLAGS += $(CLANG_FLAGS) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -61,6 +62,9 @@ endif KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow KBUILD_CFLAGS += $(call cc-option,-mno-avx,) +# Intel CET isn't enabled in the kernel +KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5993813c733d..b9d8d72d397e 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -34,6 +34,8 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +# Disable relocation relaxation in case the link is not PIE. +KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 0256064da8da..0eca7f2087b1 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -170,7 +170,7 @@ preferred_addr: notl %eax andl %eax, %ebx cmpl $LOAD_PHYSICAL_ADDR, %ebx - jge 1f + jae 1f #endif movl $LOAD_PHYSICAL_ADDR, %ebx 1: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 86558a199139..ca8151ef3bfa 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -104,7 +104,7 @@ ENTRY(startup_32) notl %eax andl %eax, %ebx cmpl $LOAD_PHYSICAL_ADDR, %ebx - jge 1f + jae 1f #endif movl $LOAD_PHYSICAL_ADDR, %ebx 1: @@ -225,6 +225,11 @@ ENTRY(efi32_stub_entry) leal efi32_config(%ebp), %eax movl %eax, efi_config(%ebp) + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + jmp startup_32 ENDPROC(efi32_stub_entry) #endif @@ -332,7 +337,7 @@ preferred_addr: notq %rax andq %rax, %rbp cmpq $LOAD_PHYSICAL_ADDR, %rbp - jge 1f + jae 1f #endif movq $LOAD_PHYSICAL_ADDR, %rbp 1: diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 028be48c8839..a628b2474e6d 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -218,7 +218,6 @@ CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y # CONFIG_LCD_CLASS_DEVICE is not set -CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set @@ -248,6 +247,7 @@ CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig index 36efc448ab0e..5a795d313cb5 100644 --- a/arch/x86/configs/i386_ranchu_defconfig +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -139,7 +139,6 @@ CONFIG_NETFILTER_XT_MATCH_MAC=y CONFIG_NETFILTER_XT_MATCH_MARK=y CONFIG_NETFILTER_XT_MATCH_POLICY=y CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y -CONFIG_NETFILTER_XT_MATCH_QTAGUID=y CONFIG_NETFILTER_XT_MATCH_QUOTA=y CONFIG_NETFILTER_XT_MATCH_QUOTA2=y CONFIG_NETFILTER_XT_MATCH_SOCKET=y diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 007d0867fd6a..6f099c7218a4 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -18,6 +18,7 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_SCHEDTUNE=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_CGROUP=y CONFIG_NAMESPACES=y CONFIG_SCHED_TUNE=y CONFIG_BLK_DEV_INITRD=y @@ -48,11 +49,10 @@ CONFIG_PREEMPT=y # CONFIG_MICROCODE is not set CONFIG_X86_MSR=y CONFIG_X86_CPUID=y -CONFIG_KSM=y CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_ZSMALLOC=y # CONFIG_MTRR is not set +CONFIG_EFI=y CONFIG_HZ_100=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y @@ -60,7 +60,7 @@ CONFIG_PHYSICAL_START=0x200000 CONFIG_RANDOMIZE_BASE=y CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="console=ttyS0 reboot=p nopti" +CONFIG_CMDLINE="nopti" CONFIG_PM_AUTOSLEEP=y CONFIG_PM_WAKELOCKS=y CONFIG_PM_WAKELOCKS_LIMIT=0 @@ -206,6 +206,7 @@ CONFIG_OF_UNITTEST=y # CONFIG_PNP_DEBUG_MESSAGES is not set CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_LOOP_MIN_COUNT=16 CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_VIRTIO_BLK=y @@ -228,6 +229,7 @@ CONFIG_DM_ZERO=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y CONFIG_DM_VERITY_FEC=y +CONFIG_DM_VERITY_AVB=y CONFIG_DM_ANDROID_VERITY=y CONFIG_NETDEVICES=y CONFIG_NETCONSOLE=y @@ -425,6 +427,7 @@ CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y +# CONFIG_EFIVAR_FS is not set CONFIG_SDCARD_FS=y CONFIG_PSTORE=y CONFIG_PSTORE_CONSOLE=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cb5b3ab5beec..649f7d604b12 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -212,7 +212,6 @@ CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y # CONFIG_LCD_CLASS_DEVICE is not set -CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set @@ -242,6 +241,7 @@ CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig index 6c983f2262d3..5ae5bc1cee69 100644 --- a/arch/x86/configs/x86_64_ranchu_defconfig +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -157,7 +157,6 @@ CONFIG_NETFILTER_XT_MATCH_MAC=y CONFIG_NETFILTER_XT_MATCH_MARK=y CONFIG_NETFILTER_XT_MATCH_POLICY=y CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y -CONFIG_NETFILTER_XT_MATCH_QTAGUID=y CONFIG_NETFILTER_XT_MATCH_QUOTA=y CONFIG_NETFILTER_XT_MATCH_QUOTA2=y CONFIG_NETFILTER_XT_MATCH_SOCKET=y diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 49a8c9f7a379..217b60246cbb 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -331,7 +331,8 @@ sysenter_past_esp: * Return back to the vDSO, which will pop ecx and edx. * Don't bother with DS and ES (they already contain __USER_DS). */ - ENABLE_INTERRUPTS_SYSEXIT + sti + sysexit .pushsection .fixup, "ax" 2: movl $0, PT_FS(%esp) @@ -554,11 +555,6 @@ ENTRY(native_iret) iret _ASM_EXTABLE(native_iret, iret_exc) END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) #endif ENTRY(overflow) @@ -1071,6 +1067,7 @@ ENTRY(int3) END(int3) ENTRY(general_protection) + ASM_CLAC pushl $do_general_protection jmp error_code END(general_protection) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 30830cd6bd3e..0a86120c1016 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -57,7 +57,7 @@ ENDPROC(native_usergs_sysret64) .macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9, EFLAGS(%rsp) /* interrupts off? */ + btl $9, EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON 1: @@ -225,6 +225,8 @@ entry_SYSCALL_64_fastpath: testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + call mds_user_clear_buffers + movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index e62f4401e792..a63934de4e58 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -383,3 +383,5 @@ 374 i386 userfaultfd sys_userfaultfd 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 +424 i386 pidfd_send_signal sys_pidfd_send_signal +434 i386 pidfd_open sys_pidfd_open diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index aa51cdd2a11a..e0e52d5fcda8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -332,6 +332,8 @@ 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 +424 common pidfd_send_signal sys_pidfd_send_signal +434 common pidfd_open sys_pidfd_open # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 3f9d1a83891a..50c1f77cab15 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -10,6 +10,7 @@ #include <linux/smp.h> #include <linux/kernel.h> #include <linux/mm_types.h> +#include <linux/elf.h> #include <asm/processor.h> #include <asm/vdso.h> diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 94c18ebfd68c..fd51f638e4ab 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -92,7 +92,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) if (boot_cpu_data.x86 == 0x0F && boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86_model <= 0x05 && - boot_cpu_data.x86_mask < 0x0A) + boot_cpu_data.x86_stepping < 0x0A) return 1; else if (amd_e400_c1e_detected) return 1; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3328a37ddc75..34f11bc42d9b 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -168,16 +168,6 @@ static inline void disable_local_APIC(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC -/* - * Make previous memory operations globally visible before - * sending the IPI through x2apic wrmsr. We need a serializing instruction or - * mfence for this. - */ -static inline void x2apic_wrmsr_fence(void) -{ - asm volatile("mfence" : : : "memory"); -} - static inline void native_apic_msr_write(u32 reg, u32 v) { if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h index 3d1ec41ae09a..20370c6db74b 100644 --- a/arch/x86/include/asm/apm.h +++ b/arch/x86/include/asm/apm.h @@ -6,8 +6,6 @@ #ifndef _ASM_X86_MACH_DEFAULT_APM_H #define _ASM_X86_MACH_DEFAULT_APM_H -#include <asm/nospec-branch.h> - #ifdef APM_ZERO_SEGS # define APM_DO_ZERO_SEGS \ "pushl %%ds\n\t" \ @@ -33,7 +31,6 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ - firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -46,7 +43,6 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, "=S" (*esi) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); - firmware_restrict_branch_speculation_end(); } static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, @@ -59,7 +55,6 @@ static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ - firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -72,7 +67,6 @@ static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, "=S" (si) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); - firmware_restrict_branch_speculation_end(); return error; } diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 249fa6b27557..47cb64dd319a 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -77,7 +77,7 @@ static __always_inline void atomic_sub(int i, atomic_t *v) */ static __always_inline int atomic_sub_and_test(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); } /** @@ -114,7 +114,7 @@ static __always_inline void atomic_dec(atomic_t *v) */ static __always_inline int atomic_dec_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); } /** @@ -127,7 +127,7 @@ static __always_inline int atomic_dec_and_test(atomic_t *v) */ static __always_inline int atomic_inc_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); } /** @@ -141,7 +141,7 @@ static __always_inline int atomic_inc_and_test(atomic_t *v) */ static __always_inline int atomic_add_negative(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); } /** @@ -220,19 +220,6 @@ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) return c; } -/** - * atomic_inc_short - increment of a short integer - * @v: pointer to type int - * - * Atomically adds 1 to @v - * Returns the new value of @u - */ -static __always_inline short int atomic_inc_short(short int *v) -{ - asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); - return *v; -} - #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 377fa50cc271..fbb9a82599ab 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -72,7 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) */ static inline int atomic64_sub_and_test(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); } /** @@ -111,7 +111,7 @@ static __always_inline void atomic64_dec(atomic64_t *v) */ static inline int atomic64_dec_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); } /** @@ -124,7 +124,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v) */ static inline int atomic64_inc_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); } /** @@ -138,7 +138,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v) */ static inline int atomic64_add_negative(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); } /** diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index b2a5bef74282..134d7ffc662e 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -119,4 +119,22 @@ do { \ #define smp_mb__before_atomic() do { } while (0) #define smp_mb__after_atomic() do { } while (0) +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. + */ +static inline void weak_wrmsr_fence(void) +{ + asm volatile("mfence; lfence" : : : "memory"); +} + #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index cfe3b954d5e4..390e323a4de9 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -77,7 +77,7 @@ set_bit(long nr, volatile unsigned long *addr) : "iq" ((u8)CONST_MASK(nr)) : "memory"); } else { - asm volatile(LOCK_PREFIX "bts %1,%0" + asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); } } @@ -93,7 +93,7 @@ set_bit(long nr, volatile unsigned long *addr) */ static inline void __set_bit(long nr, volatile unsigned long *addr) { - asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); + asm volatile(__ASM_SIZE(bts) " %1,%0" : ADDR : "Ir" (nr) : "memory"); } /** @@ -114,7 +114,7 @@ clear_bit(long nr, volatile unsigned long *addr) : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)~CONST_MASK(nr))); } else { - asm volatile(LOCK_PREFIX "btr %1,%0" + asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : BITOP_ADDR(addr) : "Ir" (nr)); } @@ -136,7 +136,7 @@ static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) static inline void __clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btr) " %1,%0" : ADDR : "Ir" (nr)); } /* @@ -168,7 +168,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) */ static inline void __change_bit(long nr, volatile unsigned long *addr) { - asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btc) " %1,%0" : ADDR : "Ir" (nr)); } /** @@ -187,7 +187,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)CONST_MASK(nr))); } else { - asm volatile(LOCK_PREFIX "btc %1,%0" + asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : BITOP_ADDR(addr) : "Ir" (nr)); } @@ -203,7 +203,8 @@ static inline void change_bit(long nr, volatile unsigned long *addr) */ static inline int test_and_set_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), + *addr, "Ir", nr, "%0", c); } /** @@ -232,7 +233,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm("bts %2,%1\n\t" + asm(__ASM_SIZE(bts) " %2,%1\n\t" "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr)); @@ -249,7 +250,8 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), + *addr, "Ir", nr, "%0", c); } /** @@ -272,7 +274,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("btr %2,%1\n\t" + asm volatile(__ASM_SIZE(btr) " %2,%1\n\t" "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr)); @@ -284,7 +286,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("btc %2,%1\n\t" + asm volatile(__ASM_SIZE(btc) " %2,%1\n\t" "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); @@ -302,7 +304,8 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) */ static inline int test_and_change_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), + *addr, "Ir", nr, "%0", c); } static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) @@ -315,7 +318,7 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; - asm volatile("bt %2,%1\n\t" + asm volatile(__ASM_SIZE(bt) " %2,%1\n\t" "sbb %0,%0" : "=r" (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index ff501e511d91..b9473858c6b6 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -8,6 +8,33 @@ #include <linux/mod_devicetable.h> +#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) + +/** + * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@_vendor + * @_family: The family number or X86_FAMILY_ANY + * @_model: The model number, model constant or X86_MODEL_ANY + * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY + * @_data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * Backport version to keep the SRBDS pile consistant. No shorter variants + * required for this. + */ +#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ + .vendor = X86_VENDOR_##_vendor, \ + .family = _family, \ + .model = _model, \ + .steppings = _steppings, \ + .feature = _feature, \ + .driver_data = (unsigned long) _data \ +} + extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); #endif diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 94491e4d21a7..7c7fc5006017 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -270,6 +270,7 @@ #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ @@ -301,16 +302,17 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ #define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ - /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ @@ -342,5 +344,6 @@ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h index f498411f2500..1b15304dd098 100644 --- a/arch/x86/include/asm/crash.h +++ b/arch/x86/include/asm/crash.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_CRASH_H #define _ASM_X86_CRASH_H +struct kimage; + int crash_load_segments(struct kimage *image); int crash_copy_backup_region(struct kimage *image); int crash_setup_memmap_entries(struct kimage *image, diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index fe884e18fa6e..c7854a098b6b 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -73,7 +73,7 @@ #define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ -#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT)) #ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index f80d70009ff8..d0e39f54feee 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -147,7 +147,7 @@ extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); -void native_set_fixmap(enum fixed_addresses idx, +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags); #ifndef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 66a5e60f60c4..4fb38927128c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -217,6 +217,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) } } +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" @@ -290,28 +298,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (static_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - else - XSTATE_OP(XSAVE, xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) { u64 mask = -1; diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 5a51fcbbe563..6db02d52cdf4 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -198,6 +198,21 @@ static inline int insn_offset_immediate(struct insn *insn) return insn_offset_displacement(insn) + insn->displacement.nbytes; } +/** + * for_each_insn_prefix() -- Iterate prefixes in the instruction + * @insn: Pointer to struct insn. + * @idx: Index storage. + * @prefix: Prefix byte. + * + * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix + * and the index is stored in @idx (note that this @idx is just for a cursor, + * do not change it.) + * Since prefixes.nbytes can be bigger than 4 if some prefixes + * are repeated, it cannot be used for looping over the prefixes. + */ +#define for_each_insn_prefix(insn, idx, prefix) \ + for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) + #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 4ad6560847b1..53238f0da79e 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -52,7 +52,7 @@ static inline void local_sub(long i, local_t *l) */ static inline int local_sub_and_test(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e); } /** @@ -65,7 +65,7 @@ static inline int local_sub_and_test(long i, local_t *l) */ static inline int local_dec_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e"); + GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e); } /** @@ -78,7 +78,7 @@ static inline int local_dec_and_test(local_t *l) */ static inline int local_inc_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e"); + GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e); } /** @@ -92,7 +92,7 @@ static inline int local_inc_and_test(local_t *l) */ static inline int local_add_negative(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s); } /** diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 90343ba50485..92ce9c8a508b 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -60,7 +60,7 @@ static inline u32 intel_get_microcode_revision(void) native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); + native_cpuid_eax(1); /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 854a20efa771..4ee65ec8c29c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -91,6 +91,10 @@ #define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ #define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ +/* SRBDS support */ +#define MSR_IA32_MCU_OPT_CTRL 0x00000123 +#define RNGDS_MITG_DIS BIT(0) + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 783f0711895b..2f84887e8934 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -178,6 +178,7 @@ enum spectre_v2_mitigation { enum spectre_v2_user_mitigation { SPECTRE_V2_USER_NONE, SPECTRE_V2_USER_STRICT, + SPECTRE_V2_USER_STRICT_PREFERRED, SPECTRE_V2_USER_PRCTL, SPECTRE_V2_USER_SECCOMP, }; @@ -274,7 +275,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); * combination with microcode which triggers a CPU buffer flush when the * instruction is executed. */ -static inline void mds_clear_cpu_buffers(void) +static __always_inline void mds_clear_cpu_buffers(void) { static const u16 ds = __KERNEL_DS; @@ -295,7 +296,7 @@ static inline void mds_clear_cpu_buffers(void) * * Clear CPU buffers if the corresponding static key is enabled */ -static inline void mds_user_clear_cpu_buffers(void) +static __always_inline void mds_user_clear_cpu_buffers(void) { if (static_branch_likely(&mds_user_clear)) mds_clear_cpu_buffers(); diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index fb1251946b45..67a140d77f33 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -15,7 +15,7 @@ #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) -#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) +#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c759b3cca663..b4c5099cafee 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -938,13 +938,6 @@ extern void default_banner(void); push %ecx; push %edx; \ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ pop %edx; pop %ecx - -#define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) - - #else /* !CONFIG_X86_32 */ /* diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 3d44191185f8..cc0e5a666c9e 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -162,15 +162,6 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); -#ifdef CONFIG_X86_32 - /* - * Atomically enable interrupts and return to userspace. This - * is only used in 32-bit kernels. 64-bit kernels use - * usergs_sysret32 instead. - */ - void (*irq_enable_sysexit)(void); -#endif - /* * Switch to usermode gs and return to 64-bit usermode using * sysret. Only used in 64-bit kernels to return to 64-bit diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index f5e780bfa2b3..66cd0c862a80 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -534,7 +534,7 @@ static inline int x86_this_cpu_variable_test_bit(int nr, { int oldbit; - asm volatile("bt "__percpu_arg(2)",%1\n\t" + asm volatile("btl "__percpu_arg(2)",%1\n\t" "sbb %0,%0" : "=r" (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 01bcde84d3e4..ad6661ca315d 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -81,7 +81,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); + GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e); } /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cac54e61c299..55163bc810db 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -88,7 +88,7 @@ struct cpuinfo_x86 { __u8 x86; /* CPU family */ __u8 x86_vendor; /* CPU vendor */ __u8 x86_model; - __u8 x86_mask; + __u8 x86_stepping; #ifdef CONFIG_X86_32 char wp_works_ok; /* It doesn't on 386's */ @@ -212,6 +212,24 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, : "memory"); } +#define native_cpuid_reg(reg) \ +static inline unsigned int native_cpuid_##reg(unsigned int op) \ +{ \ + unsigned int eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum. + */ +native_cpuid_reg(eax) +native_cpuid_reg(ebx) +native_cpuid_reg(ecx) +native_cpuid_reg(edx) + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__pa(pgdir)); diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a4a77286cb1d..ae6f1592530b 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -3,6 +3,8 @@ #include <asm/ldt.h> +struct task_struct; + /* misc architecture specific prototypes */ void syscall_init(void); diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 8f7866a5b9a4..cb0dce0273c8 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -5,7 +5,7 @@ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ - asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ + asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ : : "m" (var), ## __VA_ARGS__ \ : "memory" : cc_label); \ return 0; \ @@ -24,7 +24,7 @@ cc_label: \ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ char c; \ - asm volatile (fullop "; set" cc " %1" \ + asm volatile (fullop "; set" #cc " %1" \ : "+m" (var), "=qm" (c) \ : __VA_ARGS__ : "memory"); \ return c != 0; \ diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index 5393babc0598..4a7acb4adc6b 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -85,4 +85,6 @@ static inline void speculative_store_bypass_ht_init(void) { } extern void speculation_ctrl_update(unsigned long tif); extern void speculation_ctrl_update_current(void); +extern void mds_user_clear_buffers(void); + #endif diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 58505f01962f..743bd2d77e51 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -54,8 +54,13 @@ /* * Initialize the stackprotector canary value. * - * NOTE: this must only be called from functions that never return, + * NOTE: this must only be called from functions that never return * and it must always be inlined. + * + * In addition, it should be called from a compilation unit for which + * stack protector is disabled. Alternatively, the caller should not end + * with a function call which gets tail-call optimized as that would + * lead to checking a modified canary value. */ static __always_inline void boot_init_stack_canary(void) { diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 70bbe39043a9..7c247e7404be 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo, /* Generic stack tracer with callbacks */ struct stacktrace_ops { - void (*address)(void *data, unsigned long address, int reliable); + int (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); walk_stack_t walk_stack; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 6136d99f537b..c1adb2ed6d41 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -108,6 +108,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_IGN_TPR_SHIFT 20 #define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) +#define V_IRQ_INJECTION_BITS_MASK (V_IRQ_MASK | V_INTR_PRIO_MASK | V_IGN_TPR_MASK) + #define V_INTR_MASKING_SHIFT 24 #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8dab88b85785..33a594f728de 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -245,12 +245,15 @@ static inline void __native_flush_tlb_single(unsigned long addr) * ASID. But, userspace flushes are probably much more * important performance-wise. * - * Make sure to do only a single invpcid when KAISER is - * disabled and we have only a single ASID. + * In the KAISER disabled case, do an INVLPG to make sure + * the mapping is flushed in case it is a global one. */ - if (kaiser_enabled) + if (kaiser_enabled) { invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); - invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + } else { + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + } } static inline void __flush_tlb_all(void) diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index f556c4843aa1..ef342818fcf1 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -83,8 +83,13 @@ static inline unsigned int __getcpu(void) * works on all CPUs. This is volatile so that it orders * correctly wrt barrier() and to keep gcc from cleverly * hoisting it out of the calling function. + * + * If RDPID is available, use it. */ - asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + alternative_io ("lsl %[seg],%[p]", + ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + X86_FEATURE_RDPID, + [p] "=a" (p), [seg] "r" (__PER_CPU_SEG)); return p; } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a1e4a6c3f394..2222f39a6170 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1708,7 +1708,7 @@ int __acpi_acquire_global_lock(unsigned int *lock) new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); val = cmpxchg(lock, old, new); } while (unlikely (val != old)); - return (new < 3) ? -1 : 0; + return ((new & 0x3) < 3) ? -1 : 0; } int __acpi_release_global_lock(unsigned int *lock) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index c986d0b3bc35..df9ee8d768bf 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -105,7 +105,7 @@ int amd_cache_northbridges(void) if (boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model >= 0x8 && (boot_cpu_data.x86_model > 0x9 || - boot_cpu_data.x86_mask >= 0x1)) + boot_cpu_data.x86_stepping >= 0x1)) amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; if (boot_cpu_data.x86 == 0x15) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index be3d4dcf3a10..f53849f3f7fb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -41,6 +41,7 @@ #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <linux/atomic.h> +#include <asm/barrier.h> #include <asm/mpspec.h> #include <asm/i8259.h> #include <asm/proto.h> @@ -464,6 +465,9 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; + /* This MSR is special and need a special fence: */ + weak_wrmsr_fence(); + tsc = rdtsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; @@ -1298,16 +1302,21 @@ void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches the - * actual value. - */ - i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); - /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - logical_smp_processor_id(); + if (apic->dest_logical) { + int logical_apicid, ldr_apicid; + + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches + * the actual value. + */ + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + if (logical_apicid != BAD_APICID) + WARN_ON(logical_apicid != ldr_apicid); + /* Always use the value from LDR. */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + } #endif /* diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4d5e8ff3b5e5..497ad354e123 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1040,6 +1040,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; } mutex_lock(&ioapic_mutex); @@ -1710,9 +1720,10 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) static inline bool ioapic_irqd_mask(struct irq_data *data) { - /* If we are moving the irq we need to mask it */ + /* If we are moving the IRQ we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic_irq(data); + if (!irqd_irq_masked(data)) + mask_ioapic_irq(data); return true; } return false; @@ -1749,7 +1760,9 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) */ if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic_irq(data); + /* If the IRQ is masked in the core, leave it: */ + if (!irqd_irq_masked(data)) + unmask_ioapic_irq(data); } } #else diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cc8311c4d298..f474756fc151 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -32,7 +32,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long flags; u32 dest; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 662e9150ea6f..ad7c3544b07f 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -43,7 +43,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long this_cpu; unsigned long flags; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 4a139465f1d4..7554075414d4 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -648,9 +648,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) l = li; } addr1 = (base << shift) + - f * (unsigned long)(1 << m_io); + f * (1ULL << m_io); addr2 = (base << shift) + - (l + 1) * (unsigned long)(1 << m_io); + (l + 1) * (1ULL << m_io); pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); if (max_io < l) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 052c9c3026cc..dfdbe01ef9f2 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -239,6 +239,7 @@ #include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> +#include <asm/nospec-branch.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -613,11 +614,13 @@ static long __apm_bios_call(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; apm_bios_call_asm(call->func, call->ebx, call->ecx, &call->eax, &call->ebx, &call->ecx, &call->edx, &call->esi); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); @@ -689,10 +692,12 @@ static long __apm_bios_call_simple(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, &call->eax); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 439df975bc7a..84a7524b202c 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -65,9 +65,6 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); -#ifdef CONFIG_X86_32 - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); -#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fdeb0ce07c16..65337adcc618 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -20,7 +20,7 @@ void foo(void) OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); - OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask); + OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 424d8a636615..b8fbe983277b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -112,7 +112,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) return; } - if (c->x86_model == 6 && c->x86_mask == 1) { + if (c->x86_model == 6 && c->x86_stepping == 1) { const int K6_BUG_LOOP = 1000000; int n; void (*f_vide)(void); @@ -142,7 +142,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) /* K6 with old style WHCR */ if (c->x86_model < 8 || - (c->x86_model == 8 && c->x86_mask < 8)) { + (c->x86_model == 8 && c->x86_stepping < 8)) { /* We can only write allocate on the low 508Mb */ if (mbytes > 508) mbytes = 508; @@ -161,7 +161,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) return; } - if ((c->x86_model == 8 && c->x86_mask > 7) || + if ((c->x86_model == 8 && c->x86_stepping > 7) || c->x86_model == 9 || c->x86_model == 13) { /* The more serious chips .. */ @@ -214,7 +214,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx * As per AMD technical note 27212 0.2 */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { printk(KERN_INFO @@ -235,12 +235,12 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * but they are not certified as MP capable. */ /* Athlon 660/661 is valid. */ - if ((c->x86_model == 6) && ((c->x86_mask == 0) || - (c->x86_mask == 1))) + if ((c->x86_model == 6) && ((c->x86_stepping == 0) || + (c->x86_stepping == 1))) return; /* Duron 670 is valid */ - if ((c->x86_model == 7) && (c->x86_mask == 0)) + if ((c->x86_model == 7) && (c->x86_stepping == 0)) return; /* @@ -250,8 +250,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for * more. */ - if (((c->x86_model == 6) && (c->x86_mask >= 2)) || - ((c->x86_model == 7) && (c->x86_mask >= 1)) || + if (((c->x86_model == 6) && (c->x86_stepping >= 2)) || + ((c->x86_model == 7) && (c->x86_stepping >= 1)) || (c->x86_model > 7)) if (cpu_has(c, X86_FEATURE_MP)) return; @@ -563,7 +563,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) /* Set MTRR capability flag if appropriate */ if (c->x86 == 5) if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) + (c->x86_model == 8 && c->x86_stepping >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) @@ -902,11 +902,11 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { /* Duron Rev A0 */ - if (c->x86_model == 3 && c->x86_mask == 0) + if (c->x86_model == 3 && c->x86_stepping == 0) size = 64; /* Tbird rev A1/A2 */ if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) + (c->x86_stepping == 0 || c->x86_stepping == 1)) size = 256; } return size; @@ -1043,7 +1043,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) } /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_mask; + ms = (cpu->x86_model << 4) | cpu->x86_stepping; while ((range = *erratum++)) if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && (ms >= AMD_MODEL_RANGE_START(range)) && diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e9aa50ba4f97..ffc3bc8111b8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,7 @@ static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); +static void __init srbds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -58,7 +59,7 @@ static u64 x86_spec_ctrl_mask = SPEC_CTRL_IBRS; u64 x86_amd_ls_cfg_base; u64 x86_amd_ls_cfg_ssbd_mask; -/* Control conditional STIPB in switch_to() */ +/* Control conditional STIBP in switch_to() */ DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); /* Control conditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); @@ -99,6 +100,7 @@ void __init check_bugs(void) l1tf_select_mitigation(); mds_select_mitigation(); taa_select_mitigation(); + srbds_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -263,6 +265,11 @@ static int __init mds_cmdline(char *str) } early_param("mds", mds_cmdline); +void mds_user_clear_buffers(void) +{ + mds_user_clear_cpu_buffers(); +} + #undef pr_fmt #define pr_fmt(fmt) "TAA: " fmt @@ -364,6 +371,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting + * TSX that are only exposed to SRBDS when TSX is enabled. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -460,7 +558,8 @@ early_param("nospectre_v1", nospectre_v1_cmdline); static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; -static enum spectre_v2_user_mitigation spectre_v2_user = SPECTRE_V2_USER_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -511,10 +610,11 @@ enum spectre_v2_user_cmd { }; static const char * const spectre_v2_user_strings[] = { - [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", - [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", - [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", - [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", + [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", + [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", + [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection", + [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", + [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; static const struct { @@ -607,11 +707,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + spectre_v2_user_ibpb = mode; switch (cmd) { case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: static_branch_enable(&switch_mm_always_ibpb); + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_AUTO: @@ -627,21 +729,32 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) "always-on" : "conditional"); } - /* If enhanced IBRS is enabled no STIPB required */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + /* + * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * required. + */ + if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; /* - * If SMT is not possible or STIBP is not available clear the STIPB - * mode. + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. */ - if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) + if (mode != SPECTRE_V2_USER_STRICT && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + + /* + * If STIBP is not available, clear the STIBP mode. + */ + if (!boot_cpu_has(X86_FEATURE_STIBP)) mode = SPECTRE_V2_USER_NONE; + + spectre_v2_user_stibp = mode; + set_mode: - spectre_v2_user = mode; - /* Only print the STIBP mode when SMT possible */ - if (smt_possible) - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("%s\n", spectre_v2_user_strings[mode]); } static const char * const spectre_v2_strings[] = { @@ -881,10 +994,11 @@ void arch_smt_update(void) { mutex_lock(&spec_ctrl_mutex); - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: update_stibp_strict(); break; case SPECTRE_V2_USER_PRCTL: @@ -1109,18 +1223,41 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) return 0; } +static bool is_spec_ib_user_controlled(void) +{ + return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; +} + static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { case PR_SPEC_ENABLE: - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; + /* - * Indirect branch speculation is always disabled in strict - * mode. + * With strict mode for both IBPB and STIBP, the instruction + * code paths avoid checking this task flag and instead, + * unconditionally run the instruction. However, STIBP and IBPB + * are independent and either can be set to conditionally + * enabled regardless of the mode of the other. + * + * If either is set to conditional, allow the task flag to be + * updated, unless it was force-disabled by a previous prctl + * call. Currently, this is possible on an AMD CPU which has the + * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the + * kernel is booted with 'spectre_v2_user=seccomp', then + * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and + * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. */ - if (spectre_v2_user == SPECTRE_V2_USER_STRICT) + if (!is_spec_ib_user_controlled() || + task_spec_ib_force_disable(task)) return -EPERM; + task_clear_spec_ib_disable(task); task_update_spec_tif(task); break; @@ -1130,10 +1267,13 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) * Indirect branch speculation is always allowed when * mitigation is force disabled. */ - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; - if (spectre_v2_user == SPECTRE_V2_USER_STRICT) + + if (!is_spec_ib_user_controlled()) return 0; + task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); @@ -1163,7 +1303,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) { if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); - if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); } #endif @@ -1192,21 +1333,21 @@ static int ib_prctl_get(struct task_struct *task) if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return PR_SPEC_NOT_AFFECTED; - switch (spectre_v2_user) { - case SPECTRE_V2_USER_NONE: + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: + else if (is_spec_ib_user_controlled()) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case SPECTRE_V2_USER_STRICT: + } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) return PR_SPEC_DISABLE; - default: + else return PR_SPEC_NOT_AFFECTED; - } } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) @@ -1347,11 +1488,13 @@ static char *stibp_state(void) if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return ""; - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: return ", STIBP: disabled"; case SPECTRE_V2_USER_STRICT: return ", STIBP: forced"; + case SPECTRE_V2_USER_STRICT_PREFERRED: + return ", STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) @@ -1372,6 +1515,11 @@ static char *ibpb_state(void) return ""; } +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1413,6 +1561,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); + case X86_BUG_SRBDS: + return srbds_show_state(buf); + default: break; } @@ -1459,4 +1610,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} #endif diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 6608c03c2126..cf761e640797 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -134,7 +134,7 @@ static void init_centaur(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_TSC); break; case 8: - switch (c->x86_mask) { + switch (c->x86_stepping) { default: name = "2"; break; @@ -209,7 +209,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) * - Note, it seems this may only be in engineering samples. */ if ((c->x86 == 6) && (c->x86_model == 9) && - (c->x86_mask == 1) && (size == 65)) + (c->x86_stepping == 1) && (size == 65)) size -= 1; return size; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e8fa12c7ad5b..32567a5bb8d3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -652,7 +652,7 @@ void cpu_detect(struct cpuinfo_x86 *c) cpuid(0x00000001, &tfms, &misc, &junk, &cap0); c->x86 = (tfms >> 8) & 0xf; c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; + c->x86_stepping = tfms & 0xf; if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; @@ -912,9 +912,30 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; -static bool __init cpu_matches(unsigned long which) +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define SRBDS BIT(0) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_CORE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_ULT, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_GT3E, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_GT3E, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_CORE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPINGS(0x0, 0xC), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPINGS(0x0, 0xD), SRBDS), + {} +}; + +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { - const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + const struct x86_cpu_id *m = x86_match_cpu(table); return m && !!(m->driver_data & which); } @@ -934,29 +955,32 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) u64 ia32_cap = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ - if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); - if (cpu_matches(NO_SPECULATION)) + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); - if (cpu_matches(MSBDS_ONLY)) + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); } - if (!cpu_matches(NO_SWAPGS)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); /* @@ -974,7 +998,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); - if (cpu_matches(NO_MELTDOWN)) + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. + */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -983,7 +1016,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (cpu_matches(NO_L1TF)) + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); @@ -1157,7 +1190,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = 0; c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_model = c->x86_stepping = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; @@ -1327,6 +1360,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #endif mtrr_ap_init(); x86_spec_ctrl_setup_ap(); + update_srbds_msr(); } struct msr_range { @@ -1403,8 +1437,8 @@ void print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); - if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); + if (c->x86_stepping || c->cpuid_level >= 0) + pr_cont(", stepping: 0x%x)\n", c->x86_stepping); else printk(KERN_CONT ")\n"); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c42cc1acd668..f2eca5632f7d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -64,6 +64,7 @@ extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); extern u64 x86_read_arch_cap_msr(void); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 151625a83d9e..bc90e879998c 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -212,7 +212,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) /* common case step number/rev -- exceptions handled below */ c->x86_model = (dir1 >> 4) + 1; - c->x86_mask = dir1 & 0xf; + c->x86_stepping = dir1 & 0xf; /* Now cook; the original recipe is by Channing Corn, from Cyrix. * We do the same thing for each generation: we work out diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7beef3da5904..cb73d16d540c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -80,7 +80,7 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { if (c->x86_model == spectre_bad_microcodes[i].model && - c->x86_mask == spectre_bad_microcodes[i].stepping) + c->x86_stepping == spectre_bad_microcodes[i].stepping) return (c->microcode <= spectre_bad_microcodes[i].microcode); } return false; @@ -130,7 +130,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 && c->microcode < 0x20e) { printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); @@ -146,7 +146,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* CPUID workaround for 0F33/0F34 CPU */ if (c->x86 == 0xF && c->x86_model == 0x3 - && (c->x86_mask == 0x3 || c->x86_mask == 0x4)) + && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) c->x86_phys_bits = 36; /* @@ -246,8 +246,8 @@ int ppro_with_ram_bug(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && - boot_cpu_data.x86_mask < 8) { - printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + boot_cpu_data.x86_stepping < 8) { + pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n"); return 1; } return 0; @@ -263,7 +263,7 @@ static void intel_smp_check(struct cpuinfo_x86 *c) * Mask B, Pentium, but not Pentium MMX */ if (c->x86 == 5 && - c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_stepping >= 1 && c->x86_stepping <= 4 && c->x86_model <= 3) { /* * Remember we have B step Pentia with bugs @@ -306,7 +306,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until * model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) + if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633) clear_cpu_cap(c, X86_FEATURE_SEP); /* @@ -324,7 +324,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * P4 Xeon errata 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ - if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { + if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { @@ -340,7 +340,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * Specification Update"). */ if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && - (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + (c->x86_stepping < 0x6 || c->x86_stepping == 0xb)) set_cpu_bug(c, X86_BUG_11AP); @@ -555,7 +555,7 @@ static void init_intel(struct cpuinfo_x86 *c) case 6: if (l2 == 128) p = "Celeron (Mendocino)"; - else if (c->x86_mask == 0 || c->x86_mask == 5) + else if (c->x86_stepping == 0 || c->x86_stepping == 5) p = "Celeron-A"; break; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index fbb5e90557a5..a207aaaf78db 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -33,13 +33,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4b9cfdcc3aaa..605395bbf0d8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1536,11 +1536,10 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) mce_flags.overflow_recov = 1; /* - * Turn off MC4_MISC thresholding banks on those models since + * Turn off MC4_MISC thresholding banks on all models since * they're not supported there. */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + if (c->x86 == 0x15) { int i; u64 hwcr; bool need_toggle; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 2116176c1721..37f5c2608844 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -560,9 +560,12 @@ static const struct sysfs_ops threshold_ops = { .store = store, }; +static void threshold_block_release(struct kobject *kobj); + static struct kobj_type threshold_ktype = { .sysfs_ops = &threshold_ops, .default_attrs = default_attrs, + .release = threshold_block_release, }; static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, @@ -765,8 +768,12 @@ static int threshold_create_device(unsigned int cpu) return err; } -static void deallocate_threshold_block(unsigned int cpu, - unsigned int bank) +static void threshold_block_release(struct kobject *kobj) +{ + kfree(to_block(kobj)); +} + +static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { struct threshold_block *pos = NULL; struct threshold_block *tmp = NULL; @@ -776,13 +783,11 @@ static void deallocate_threshold_block(unsigned int cpu, return; list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_put(&pos->kobj); list_del(&pos->miscj); - kfree(pos); + kobject_put(&pos->kobj); } - kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); - per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; + kobject_put(&head->blocks->kobj); } static void __threshold_remove_blocks(struct threshold_bank *b) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index afaf648386e9..d4c3a30a7b33 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -132,51 +132,6 @@ load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, } } -/* - * Given CPU signature and a microcode patch, this function finds if the - * microcode patch has matching family and model with the CPU. - */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) -{ - unsigned int fam, model; - unsigned int fam_ucode, model_ucode; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - - fam = __x86_family(sig); - model = x86_model(sig); - - fam_ucode = __x86_family(mc_header->sig); - model_ucode = x86_model(mc_header->sig); - - if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; - - ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - ext_sigcount = ext_header->count; - - for (i = 0; i < ext_sigcount; i++) { - fam_ucode = __x86_family(ext_sig->sig); - model_ucode = x86_model(ext_sig->sig); - - if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; - - ext_sig++; - } - return UCODE_NFOUND; -} - static int save_microcode(struct mc_saved_data *mc_saved_data, struct microcode_intel **mc_saved_src, @@ -321,8 +276,8 @@ get_matching_model_microcode(int cpu, unsigned long start, * the platform, we need to find and save microcode patches * with the same family and model as the BSP. */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { + if (!find_matching_signature(mc_header, uci->cpu_sig.sig, + uci->cpu_sig.pf)) { ucode_ptr += mc_size; continue; } @@ -1013,7 +968,7 @@ static bool is_blacklisted(unsigned int cpu) */ if (c->x86 == 6 && c->x86_model == 79 && - c->x86_mask == 0x01 && + c->x86_stepping == 0x01 && llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); @@ -1036,7 +991,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return UCODE_NFOUND; sprintf(name, "intel-ucode/%02x-%02x-%02x", - c->x86, c->x86_model, c->x86_mask); + c->x86, c->x86_model, c->x86_stepping); if (request_firmware_direct(&firmware, name, device)) { pr_debug("data file %s load failed\n", name); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 136ae86f4f5f..e2fa0fcbaa69 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -166,9 +166,6 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, *repeat = 0; *uniform = 1; - /* Make end inclusive instead of exclusive */ - end--; - prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -260,6 +257,9 @@ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) int repeat; u64 partial_end; + /* Make end inclusive instead of exclusive */ + end--; + if (!mtrr_state_set) return MTRR_TYPE_INVALID; @@ -860,7 +860,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && - boot_cpu_data.x86_mask <= 7) { + boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 49bd700d9b7f..be3050f23536 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -699,8 +699,8 @@ void __init mtrr_bp_init(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 0x3 && - (boot_cpu_data.x86_mask == 0x3 || - boot_cpu_data.x86_mask == 0x4)) + (boot_cpu_data.x86_stepping == 0x3 || + boot_cpu_data.x86_stepping == 0x4)) phys_addr = 36; size_or_mask = SIZE_OR_MASK_BITS(phys_addr); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fbf2edc3eb35..3e71f89be1c2 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1550,6 +1550,7 @@ static void __init filter_events(struct attribute **attrs) { struct device_attribute *d; struct perf_pmu_events_attr *pmu_attr; + int offset = 0; int i, j; for (i = 0; attrs[i]; i++) { @@ -1558,7 +1559,7 @@ static void __init filter_events(struct attribute **attrs) /* str trumps id */ if (pmu_attr->event_str) continue; - if (x86_pmu.event_map(i)) + if (x86_pmu.event_map(i + offset)) continue; for (j = i; attrs[j]; j++) @@ -1566,6 +1567,14 @@ static void __init filter_events(struct attribute **attrs) /* Check the shifted attr. */ i--; + + /* + * event_map() is index based, the attrs array is organized + * by increasing event index. If we shift the events, then + * we need to compensate for the event_map(), otherwise + * we are looking up the wrong event in the map + */ + offset++; } } @@ -1992,6 +2001,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); + event->destroy = NULL; } if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) @@ -2187,11 +2197,11 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr, int reliable) +static int backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - perf_callchain_store(entry, addr); + return perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c index 97242a9242bd..ec0bfbab7265 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -80,12 +80,12 @@ static struct attribute_group amd_iommu_format_group = { * sysfs events attributes *---------------------------------------------*/ struct amd_iommu_event_desc { - struct kobj_attribute attr; + struct device_attribute attr; const char *event; }; -static ssize_t _iommu_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t _iommu_event_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct amd_iommu_event_desc *event = container_of(attr, struct amd_iommu_event_desc, attr); diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 49742746a6c9..98e786a779fd 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -181,21 +181,19 @@ static int amd_uncore_event_init(struct perf_event *event) return -ENOENT; /* - * NB and L2 counters (MSRs) are shared across all cores that share the - * same NB / L2 cache. Interrupts can be directed to a single target - * core, however, event counts generated by processes running on other - * cores cannot be masked out. So we do not support sampling and - * per-thread events. + * NB and Last level cache counters (MSRs) are shared across all cores + * that share the same NB / Last level cache. On family 16h and below, + * Interrupts can be directed to a single target core, however, event + * counts generated by processes running on other cores cannot be masked + * out. So we do not support sampling and per-thread events via + * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts: */ - if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) - return -EINVAL; /* NB and L2 counters do not have usr/os/guest/host bits */ if (event->attr.exclude_user || event->attr.exclude_kernel || event->attr.exclude_host || event->attr.exclude_guest) return -EINVAL; - /* and we do not enable counter overflow interrupts */ hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; hwc->idx = -1; @@ -271,6 +269,7 @@ static struct pmu amd_nb_pmu = { .start = amd_uncore_start, .stop = amd_uncore_stop, .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, }; static struct pmu amd_l2_pmu = { @@ -282,6 +281,7 @@ static struct pmu amd_l2_pmu = { .start = amd_uncore_start, .stop = amd_uncore_stop, .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, }; static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3572434a73cb..d973c079e97c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1937,7 +1937,8 @@ intel_bts_constraints(struct perf_event *event) static int intel_alt_er(int idx, u64 config) { - int alt_idx; + int alt_idx = idx; + if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) return idx; @@ -3051,7 +3052,7 @@ static int intel_snb_pebs_broken(int cpu) break; case 45: /* SNB-EP */ - switch (cpu_data(cpu).x86_mask) { + switch (cpu_data(cpu).x86_stepping) { case 6: rev = 0x618; break; case 7: rev = 0x70c; break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 2cdae69d7e0b..09058ad9816c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -1017,7 +1017,7 @@ void __init intel_pmu_lbr_init_atom(void) * on PMU interrupt */ if (boot_cpu_data.x86_model == 28 - && boot_cpu_data.x86_mask < 10) { + && boot_cpu_data.x86_stepping < 10) { pr_cont("LBR disabled due to erratum"); return; } diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 7c1a0c07b607..507e2e319f52 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -233,7 +233,7 @@ static __initconst const struct x86_pmu p6_pmu = { static __init void p6_pmu_rdpmc_quirk(void) { - if (boot_cpu_data.x86_mask < 9) { + if (boot_cpu_data.x86_stepping < 9) { /* * PPro erratum 26; fixed in stepping 9 and above. */ diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 935225c0375f..c4f772d3f35c 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -70,8 +70,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) c->x86_model, c->x86_model_id[0] ? c->x86_model_id : "unknown"); - if (c->x86_mask || c->cpuid_level >= 0) - seq_printf(m, "stepping\t: %d\n", c->x86_mask); + if (c->x86_stepping || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_stepping); else seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index c2a9dd816c5c..9a7983968ba8 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -115,11 +115,12 @@ void __init tsx_init(void) tsx_disable(); /* - * tsx_disable() will change the state of the - * RTM CPUID bit. Clear it here since it is now - * expected to be not set. + * tsx_disable() will change the state of the RTM and HLE CPUID + * bits. Clear them here since they are now expected to be not + * set. */ setup_clear_cpu_cap(X86_FEATURE_RTM); + setup_clear_cpu_cap(X86_FEATURE_HLE); } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { /* @@ -131,10 +132,10 @@ void __init tsx_init(void) tsx_enable(); /* - * tsx_enable() will change the state of the - * RTM CPUID bit. Force it here since it is now - * expected to be set. + * tsx_enable() will change the state of the RTM and HLE CPUID + * bits. Force them here since they are now expected to be set. */ setup_force_cpu_cap(X86_FEATURE_RTM); + setup_force_cpu_cap(X86_FEATURE_HLE); } } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2c1910f6717e..a6d623e43d62 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/overflow.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -572,7 +573,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(sizeof(struct crash_mem)); + cmem = vzalloc(struct_size(cmem, ranges, 1)); if (!cmem) return -ENOMEM; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9c30acfadae2..0d1ff4b407d4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo, if (!__kernel_text_address(addr)) break; - ops->address(data, addr, 1); + if (ops->address(data, addr, 1)) + break; frame = frame->next_frame; ret_addr = &frame->return_address; print_ftrace_graph_addr(addr, data, ops, tinfo, graph); @@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name) /* * Print one address/symbol entries per line. */ -static void print_trace_address(void *data, unsigned long addr, int reliable) +static int print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); printk_stack_address(addr, reliable, data); + return 0; } static const struct stacktrace_ops print_trace_ops = { diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 31fad2cbd734..9a1489b92782 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -262,15 +262,23 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) return 0; } - if (!access_ok(VERIFY_READ, buf, size)) + if (!access_ok(VERIFY_READ, buf, size)) { + fpu__clear(fpu); return -EACCES; + } fpu__activate_curr(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + if (!static_cpu_has(X86_FEATURE_FPU)) { + int ret = fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + + if (ret) + fpu__clear(fpu); + + return ret != 0; + } if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -317,10 +325,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); } + local_bh_disable(); fpu->fpstate_active = 1; - preempt_disable(); fpu__restore(fpu); - preempt_enable(); + local_bh_enable(); return err; } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 3fa200ecca62..1ff1adbc843b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -293,12 +293,31 @@ static void __init setup_xstate_comp(void) } /* + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. + */ +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) + +/* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { static int on_boot_cpu = 1; + BUILD_BUG_ON(XCNTXT_MASK != XFEATURES_INIT_FPSTATE_HANDLED); + WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -319,10 +338,22 @@ static void __init setup_init_fpu_buf(void) copy_kernel_to_xregs_booting(&init_fpstate.xsave); /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. + * All components are now in init state. Read the state back so + * that init_fpstate contains all non-zero init state. This only + * works with XSAVE, but not with XSAVEOPT and XSAVES because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVES is available because XSAVES uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. */ - copy_xregs_to_kernel_booting(&init_fpstate.xsave); + fxsave(&init_fpstate.fxsave); } static int xfeature_is_supervisor(int xfeature_nr) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 1c0b49fd6365..10139b8f1e53 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -34,7 +34,7 @@ #define X86 new_cpu_data+CPUINFO_x86 #define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor #define X86_MODEL new_cpu_data+CPUINFO_x86_model -#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping #define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math #define X86_CPUID new_cpu_data+CPUINFO_cpuid_level #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability @@ -440,7 +440,7 @@ enable_paging: shrb $4,%al movb %al,X86_MODEL andb $0x0f,%cl # mask mask revision - movb %cl,X86_MASK + movb %cl,X86_STEPPING movl %edx,X86_CAPABILITY is486: diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 4e3b8a587c88..01a1ab8483ac 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -204,7 +204,7 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG + printk_deferred(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f669fdd2010..2a53a63f1e70 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -283,8 +283,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) { if (handler) kvm_posted_intr_wakeup_handler = handler; - else + else { kvm_posted_intr_wakeup_handler = dummy_handler; + synchronize_rcu(); + } } EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 0bf17576dd2a..299e7fb55f16 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -212,8 +212,7 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; /* Copying screen_info will do? */ - memcpy(¶ms->screen_info, &boot_params.screen_info, - sizeof(struct screen_info)); + memcpy(¶ms->screen_info, &screen_info, sizeof(struct screen_info)); /* Fill in memsize later */ params->screen_info.ext_mem_k = 0; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index ed15cd486d06..7c759ef6ddfb 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -437,7 +437,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) */ void kgdb_roundup_cpus(unsigned long flags) { - apic->send_IPI_allbutself(APIC_DM_NMI); + apic->send_IPI_allbutself(NMI_VECTOR); } #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 01fda02e1df6..05ab9c8fd7a2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -751,6 +751,13 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) void *frame_pointer; bool skipped = false; + /* + * Set a dummy kprobe for avoiding kretprobe recursion. + * Since kretprobe never run in kprobe handler, kprobe must not + * be running at this point. + */ + kprobe_busy_begin(); + INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ @@ -826,10 +833,9 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); + __this_cpu_write(current_kprobe, &kprobe_busy); } recycle_rp_inst(ri, &empty_rp); @@ -845,6 +851,8 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); + kprobe_busy_end(); + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); @@ -1007,6 +1015,11 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * So clear it by resetting the current kprobe: */ regs->flags &= ~X86_EFLAGS_TF; + /* + * Since the single step (trap) has been cancelled, + * we need to restore BTF here. + */ + restore_btf(); /* * If the TF flag was set before the kprobe hit, diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 94779f66bf49..6f0d340594ca 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -124,6 +124,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value; break; case R_386_PC32: + case R_386_PLT32: /* Add the value, subtract its position */ *location += sym->st_value - (uint32_t)location; break; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 30ca7607cbbb..c1cccfa900de 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -407,7 +407,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01; processor.cpuflag = CPU_ENABLED; processor.cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping; processor.featureflag = boot_cpu_data.x86_capability[0]; processor.reserved[0] = 0; processor.reserved[1] = 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 632195b41688..2cd05f34c0b6 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -168,9 +168,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || -#ifdef CONFIG_X86_32 - type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || -#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -226,7 +223,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); @@ -385,9 +381,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) - .irq_enable_sysexit = native_irq_enable_sysexit, -#endif #ifdef CONFIG_X86_64 #ifdef CONFIG_IA32_EMULATION .usergs_sysret32 = native_usergs_sysret32, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index c89f50a76e97..158dc0650d5d 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -5,7 +5,6 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); @@ -46,7 +45,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, restore_fl); PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 0677bf8d3a42..03c6a8cf33c4 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -12,7 +12,6 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f69bae6b29e1..4112e76c0a7e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -319,28 +319,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, u64 msr = x86_spec_ctrl_base; bool updmsr = false; - /* - * If TIF_SSBD is different, select the proper mitigation - * method. Note that if SSBD mitigation is disabled or permanentely - * enabled this branch can't be taken because nothing can set - * TIF_SSBD. - */ - if (tif_diff & _TIF_SSBD) { - if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + /* Handle change of TIF_SSBD depending on the mitigation method. */ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_ssb_virt_state(tifn); - } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_core_ssb_state(tifn); - } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - msr |= ssbd_tif_to_spec_ctrl(tifn); - updmsr = true; - } + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + updmsr |= !!(tif_diff & _TIF_SSBD); + msr |= ssbd_tif_to_spec_ctrl(tifn); } - /* - * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, - * otherwise avoid the MSR write. - */ + /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ if (IS_ENABLED(CONFIG_SMP) && static_branch_unlikely(&switch_to_cond_stibp)) { updmsr |= !!(tif_diff & _TIF_SPEC_IB); diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 898e97cf6629..320ab978fb1f 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -19,7 +19,7 @@ static inline void switch_to_extra(struct task_struct *prev, if (IS_ENABLED(CONFIG_SMP)) { /* * Avoid __switch_to_xtra() invocation when conditional - * STIPB is disabled and the only different bit is + * STIBP is disabled and the only different bit is * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not * in the TIF_WORK_CTXSW masks. */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 618565fecb1c..1a79d451cd34 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -128,7 +128,7 @@ void release_thread(struct task_struct *dead_task) if (dead_task->mm->context.ldt) { pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, - dead_task->mm->context.ldt, + dead_task->mm->context.ldt->entries, dead_task->mm->context.ldt->size); BUG(); } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9a16932c7258..143c06f84596 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -162,6 +162,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, + { /* Handle problems with rebooting on Apple MacBook6,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook6,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"), + }, + }, { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, .ident = "Apple MacBookPro5", @@ -329,10 +337,11 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, { /* Handle problems with rebooting on the OptiPlex 990. */ .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + .ident = "Dell OptiPlex 990 BIOS A0x", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + DMI_MATCH(DMI_BIOS_VERSION, "A0"), }, }, { /* Handle problems with rebooting on Dell 300's */ @@ -410,6 +419,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + { /* PCIe Wifi card isn't detected after reboot otherwise */ + .callback = set_pci_reboot, + .ident = "Zotac ZBOX CI327 nano", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"), + }, + }, + /* Sony */ { /* Handle problems with rebooting on Sony VGN-Z540N */ .callback = set_bios_reboot, @@ -471,29 +489,20 @@ static void emergency_vmx_disable_all(void) local_irq_disable(); /* - * We need to disable VMX on all CPUs before rebooting, otherwise - * we risk hanging up the machine, because the CPU ignore INIT - * signals when VMX is enabled. - * - * We can't take any locks and we may be on an inconsistent - * state, so we use NMIs as IPIs to tell the other CPUs to disable - * VMX and halt. + * Disable VMX on all CPUs before rebooting, otherwise we risk hanging + * the machine, because the CPU blocks INIT when it's in VMX root. * - * For safety, we will avoid running the nmi_shootdown_cpus() - * stuff unnecessarily, but we don't have a way to check - * if other CPUs have VMX enabled. So we will call it only if the - * CPU we are running on has VMX enabled. + * We can't take any locks and we may be on an inconsistent state, so + * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. * - * We will miss cases where VMX is not enabled on all CPUs. This - * shouldn't do much harm because KVM always enable VMX on all - * CPUs anyway. But we can miss it on the small window where KVM - * is still enabling VMX. + * Do the NMI shootdown even if VMX if off on _this_ CPU, as that + * doesn't prevent a different CPU from being in VMX root operation. */ - if (cpu_has_vmx() && cpu_vmx_enabled()) { - /* Disable VMX on this CPU. */ - cpu_vmxoff(); + if (cpu_has_vmx()) { + /* Safely force _this_ CPU out of VMX root operation. */ + __cpu_emergency_vmxoff(); - /* Halt and disable VMX on the other CPUs */ + /* Halt and exit VMX root operation on the other CPUs. */ nmi_shootdown_cpus(vmxoff_nmi); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c017f1c71560..0512af683871 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -243,6 +243,14 @@ static void notrace start_secondary(void *unused) wmb(); cpu_startup_entry(CPUHP_ONLINE); + + /* + * Prevent tail call to cpu_startup_entry() because the stack protector + * guard has been changed a couple of function calls up, in + * boot_init_stack_canary() and must not be checked before tail calling + * another function. + */ + prevent_tail_call_optimization(); } void __init smp_store_boot_cpu_info(void) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c6430e5a..9ee98eefc44d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void +static int __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) - return; + return 0; #endif if (nosched && in_sched_functions(addr)) - return; + return 0; if (trace->skip > 0) { trace->skip--; - return; + return 0; } - if (trace->nr_entries < trace->max_entries) + if (trace->nr_entries < trace->max_entries) { trace->entries[trace->nr_entries++] = addr; + return 0; + } else { + return -1; /* no more room, stop walking the stack */ + } } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static int save_stack_address(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, false); } -static void +static int save_stack_address_nosched(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, true); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 590c8fd2ed9b..700b8e857025 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -22,10 +22,6 @@ #include <asm/hpet.h> #include <asm/time.h> -#ifdef CONFIG_X86_64 -__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES; -#endif - unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 8c38784cf992..60ccfa4c2768 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -268,10 +268,11 @@ static volatile u32 good_2byte_insns[256 / 32] = { static bool is_prefix_bad(struct insn *insn) { + insn_byte_t p; int i; - for (i = 0; i < insn->prefixes.nbytes; i++) { - switch (insn->prefixes.bytes[i]) { + for_each_insn_prefix(insn, i, p) { + switch (p) { case 0x26: /* INAT_PFX_ES */ case 0x2E: /* INAT_PFX_CS */ case 0x36: /* INAT_PFX_DS */ @@ -711,6 +712,7 @@ static struct uprobe_xol_ops branch_xol_ops = { static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); + insn_byte_t p; int i; switch (opc1) { @@ -741,8 +743,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. * No one uses these insns, reject any branch insns with such prefix. */ - for (i = 0; i < insn->prefixes.nbytes; i++) { - if (insn->prefixes.bytes[i] == 0x66) + for_each_insn_prefix(insn, i, p) { + if (p == 0x66) return -ENOTSUPP; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 6c6bc7f4ba8a..238b7dcd5e93 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -34,13 +34,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) #ifdef CONFIG_X86_32 OUTPUT_ARCH(i386) ENTRY(phys_startup_32) -jiffies = jiffies_64; #else OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) -jiffies_64 = jiffies; #endif +jiffies = jiffies_64; + #if defined(CONFIG_X86_64) /* * On 64-bit, align RODATA to 2MB so we retain large page mappings for @@ -348,7 +348,10 @@ SECTIONS /* Sections to be discarded */ DISCARDS - /DISCARD/ : { *(.eh_frame) } + /DISCARD/ : { + *(.eh_frame) + *(__func_stack_frame_non_standard) + } } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 53b7f53f6207..c88de752cabc 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -26,7 +26,6 @@ config KVM depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER - select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select IRQ_BYPASS_MANAGER diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 40e415fedcee..b60ffd1b3ae2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -267,13 +267,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, { switch (func) { case 0: - entry->eax = 1; /* only one leaf currently */ + entry->eax = 7; ++*nent; break; case 1: entry->ecx = F(MOVBE); ++*nent; break; + case 7: + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + if (index == 0) + entry->ecx = F(RDPID); + ++*nent; default: break; } @@ -373,7 +378,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, r = -E2BIG; - if (*nent >= maxnent) + if (WARN_ON(*nent >= maxnent)) goto out; do_cpuid_1_ent(entry, function, index); @@ -452,7 +457,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx |= F(SPEC_CTRL); if (boot_cpu_has(X86_FEATURE_STIBP)) entry->edx |= F(INTEL_STIBP); - if (boot_cpu_has(X86_FEATURE_SSBD)) + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) entry->edx |= F(SPEC_CTRL_SSBD); /* * We emulate ARCH_CAPABILITIES in software even @@ -605,8 +611,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); unsigned phys_as = entry->eax & 0xff; - if (!g_phys_as) + /* + * Use bare metal's MAXPHADDR if the CPU doesn't report guest + * MAXPHYADDR separately, or if TDP (NPT) is disabled, as the + * guest version "applies only to guests using nested paging". + */ + if (!g_phys_as || !tdp_enabled) g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; /* @@ -669,6 +681,9 @@ out: static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, u32 idx, int *nent, int maxnent, unsigned int type) { + if (*nent >= maxnent) + return -E2BIG; + if (type == KVM_GET_EMULATED_CPUID) return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6c7847b3aa2d..827d54a5126e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -23,6 +23,7 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" #include <linux/module.h> +#include <linux/nospec.h> #include <asm/kvm_emulate.h> #include <linux/stringify.h> #include <asm/debugreg.h> @@ -3518,6 +3519,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_rdpid(struct x86_emulate_ctxt *ctxt) +{ + u64 tsc_aux = 0; + + if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) + return emulate_ud(ctxt); + ctxt->dst.val = tsc_aux; + return X86EMUL_CONTINUE; +} + static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { u64 tsc = 0; @@ -3911,6 +3922,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_clflushopt(struct x86_emulate_ctxt *ctxt) +{ + /* emulating clflushopt regardless of cpuid */ + return X86EMUL_CONTINUE; +} + static int em_movsxd(struct x86_emulate_ctxt *ctxt) { ctxt->dst.val = (s32) ctxt->src.val; @@ -4378,10 +4395,20 @@ static const struct opcode group8[] = { F(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; +/* + * The "memory" destination is actually always a register, since we come + * from the register case of group9. + */ +static const struct gprefix pfx_0f_c7_7 = { + N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp), +}; + + static const struct group_dual group9 = { { N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { - N, N, N, N, N, N, N, N, + N, N, N, N, N, N, N, + GP(0, &pfx_0f_c7_7), } }; static const struct opcode group11[] = { @@ -4390,7 +4417,7 @@ static const struct opcode group11[] = { }; static const struct gprefix pfx_0f_ae_7 = { - I(SrcMem | ByteOp, em_clflush), N, N, N, + I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N, }; static const struct group_dual group15 = { { @@ -4989,6 +5016,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->fetch.ptr = ctxt->fetch.data; ctxt->fetch.end = ctxt->fetch.data + insn_len; ctxt->opcode_len = 1; + ctxt->intercept = x86_intercept_none; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); else { @@ -5041,16 +5069,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->ad_bytes = def_ad_bytes ^ 6; break; case 0x26: /* ES override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_ES; + break; case 0x2e: /* CS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_CS; + break; case 0x36: /* SS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_SS; + break; case 0x3e: /* DS override */ has_seg_override = true; - ctxt->seg_override = (ctxt->b >> 3) & 3; + ctxt->seg_override = VCPU_SREG_DS; break; case 0x64: /* FS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_FS; + break; case 0x65: /* GS override */ has_seg_override = true; - ctxt->seg_override = ctxt->b & 7; + ctxt->seg_override = VCPU_SREG_GS; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) @@ -5134,10 +5174,15 @@ done_prefixes: } break; case Escape: - if (ctxt->modrm > 0xbf) - opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; - else + if (ctxt->modrm > 0xbf) { + size_t size = ARRAY_SIZE(opcode.u.esc->high); + u32 index = array_index_nospec( + ctxt->modrm - 0xc0, size); + + opcode = opcode.u.esc->high[index]; + } else { opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; + } break; case InstrDual: if ((ctxt->modrm >> 6) == 3) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 62cf8c915e95..fce6fa012d30 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -26,6 +26,7 @@ #include "hyperv.h" #include <linux/kvm_host.h> +#include <linux/nospec.h> #include <trace/events/kvm.h> #include "trace.h" @@ -53,11 +54,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, u32 index, u64 *pdata) { struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - *pdata = hv->hv_crash_param[index]; + *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } @@ -96,11 +98,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, u32 index, u64 data) { struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - hv->hv_crash_param[index] = data; + hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360f1848..791850bfc981 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -456,46 +456,37 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(gpa_t addr) -{ - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - case 0x4d0: - case 0x4d1: - return 1; - default: - return 0; - } -} - static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { unsigned char data = *(unsigned char *)val; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; if (len != 1) { pr_pic_unimpl("non byte write\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: + pic_lock(s); + pic_ioport_write(&s->pics[0], addr, data); + pic_unlock(s); + break; case 0xa0: case 0xa1: - pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_lock(s); + pic_ioport_write(&s->pics[1], addr, data); + pic_unlock(s); break; case 0x4d0: case 0x4d1: + pic_lock(s); elcr_ioport_write(&s->pics[addr & 1], addr, data); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - pic_unlock(s); return 0; } @@ -503,29 +494,31 @@ static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { unsigned char data = 0; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; if (len != 1) { memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: + pic_lock(s); data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_unlock(s); break; case 0x4d0: case 0x4d1: + pic_lock(s); data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } *(unsigned char *)val = data; - pic_unlock(s); return 0; } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index d380111351c0..086833ecb9f2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -36,6 +36,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/nospec.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/current.h> @@ -73,13 +74,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, default: { u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; - u64 redir_content; + u64 redir_content = ~0ULL; - if (redir_index < IOAPIC_NUM_PINS) - redir_content = - ioapic->redirtbl[redir_index].bits; - else - redir_content = ~0ULL; + if (redir_index < IOAPIC_NUM_PINS) { + u32 index = array_index_nospec( + redir_index, IOAPIC_NUM_PINS); + + redir_content = ioapic->redirtbl[index].bits; + } result = (ioapic->ioregsel & 0x1) ? (redir_content >> 32) & 0xffffffff : @@ -289,6 +291,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; + index = array_index_nospec(index, IOAPIC_NUM_PINS); e = &ioapic->redirtbl[index]; mask_before = e->fields.mask; /* Preserve read-only fields */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3c70f6c76d3a..078b2176f2a2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -36,6 +36,7 @@ #include <asm/delay.h> #include <linux/atomic.h> #include <linux/jump_label.h> +#include <linux/nospec.h> #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" @@ -535,9 +536,11 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) { u8 val; - if (pv_eoi_get_user(vcpu, &val) < 0) + if (pv_eoi_get_user(vcpu, &val) < 0) { apic_debug("Can't read EOI MSR value: 0x%llx\n", (unsigned long long)vcpu->arch.pv_eoi.msr_val); + return false; + } return val & 0x1; } @@ -1432,15 +1435,21 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: + case APIC_LVTERR: { /* TODO: Check vector */ + size_t size; + u32 index; + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; + size = ARRAY_SIZE(apic_lvt_mask); + index = array_index_nospec( + (reg - APIC_LVTT) >> 4, size); + val &= apic_lvt_mask[index]; apic_set_reg(apic, reg, val); - break; + } case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a5b533aea958..2ff0fe32c015 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3679,7 +3679,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 0149ac59c273..3e3016411020 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -17,6 +17,7 @@ */ #include <linux/kvm_host.h> +#include <linux/nospec.h> #include <asm/mtrr.h> #include "cpuid.h" @@ -202,11 +203,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) break; case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: *seg = 1; - *unit = msr - MSR_MTRRfix16K_80000; + *unit = array_index_nospec( + msr - MSR_MTRRfix16K_80000, + MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1); break; case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: *seg = 2; - *unit = msr - MSR_MTRRfix4K_C0000; + *unit = array_index_nospec( + msr - MSR_MTRRfix4K_C0000, + MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1); break; default: return false; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index f96e1f962587..fbf3d25af765 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -1,6 +1,8 @@ #ifndef __KVM_X86_PMU_H #define __KVM_X86_PMU_H +#include <linux/nospec.h> + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -80,8 +82,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc) static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, u32 base) { - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) - return &pmu->gp_counters[msr - base]; + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) { + u32 index = array_index_nospec(msr - base, + pmu->nr_arch_gp_counters); + + return &pmu->gp_counters[index]; + } return NULL; } @@ -91,8 +97,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) { int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) - return &pmu->fixed_counters[msr - base]; + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) { + u32 index = array_index_nospec(msr - base, + pmu->nr_arch_fixed_counters); + + return &pmu->fixed_counters[index]; + } return NULL; } diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 8fc07ea23344..04890ac518d0 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -29,7 +29,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, + [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; /* mapping between fixed pmc index and intel_arch_events array */ @@ -87,10 +87,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu, static unsigned intel_find_fixed_event(int idx) { - if (idx >= ARRAY_SIZE(fixed_pmc_events)) + u32 event; + size_t size = ARRAY_SIZE(fixed_pmc_events); + + if (idx >= size) return PERF_COUNT_HW_MAX; - return intel_arch_events[fixed_pmc_events[idx]].event_type; + event = fixed_pmc_events[array_index_nospec(idx, size)]; + return intel_arch_events[event].event_type; } /* check if a PMC is enabled by comparising it with globl_ctrl bits. */ @@ -131,15 +135,19 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); struct kvm_pmc *counters; + unsigned int num_counters; idx &= ~(3u << 30); - if (!fixed && idx >= pmu->nr_arch_gp_counters) - return NULL; - if (fixed && idx >= pmu->nr_arch_fixed_counters) + if (fixed) { + counters = pmu->fixed_counters; + num_counters = pmu->nr_arch_fixed_counters; + } else { + counters = pmu->gp_counters; + num_counters = pmu->nr_arch_gp_counters; + } + if (idx >= num_counters) return NULL; - counters = fixed ? pmu->fixed_counters : pmu->gp_counters; - - return &counters[idx]; + return &counters[array_index_nospec(idx, num_counters)]; } static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9fc536657492..77bee73faebc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2250,7 +2250,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; - dst->asid = from->asid; + /* asid not copied, it is handled manually for svm->vmcb. */ dst->tlb_ctl = from->tlb_ctl; dst->int_ctl = from->int_ctl; dst->int_vector = from->int_vector; @@ -2564,7 +2564,11 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->nested.intercept = nested_vmcb->control.intercept; svm_flush_tlb(&svm->vcpu); - svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; + svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl & + (V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK); + + svm->vmcb->control.int_ctl |= V_INTR_MASKING_MASK; + if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; else @@ -2927,7 +2931,7 @@ static int cr_interception(struct vcpu_svm *svm) err = 0; if (cr >= 16) { /* mov to cr */ cr -= 16; - val = kvm_register_read(&svm->vcpu, reg); + val = kvm_register_readl(&svm->vcpu, reg); switch (cr) { case 0: if (!check_selective_cr0_intercepted(svm, val)) @@ -2972,7 +2976,7 @@ static int cr_interception(struct vcpu_svm *svm) kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } - kvm_register_write(&svm->vcpu, reg, val); + kvm_register_writel(&svm->vcpu, reg, val); } kvm_complete_insn_gp(&svm->vcpu, err); @@ -3004,13 +3008,13 @@ static int dr_interception(struct vcpu_svm *svm) if (dr >= 16) { /* mov to DRn */ if (!kvm_require_dr(&svm->vcpu, dr - 16)) return 1; - val = kvm_register_read(&svm->vcpu, reg); + val = kvm_register_readl(&svm->vcpu, reg); kvm_set_dr(&svm->vcpu, dr - 16, val); } else { if (!kvm_require_dr(&svm->vcpu, dr)) return 1; kvm_get_dr(&svm->vcpu, dr, &val); - kvm_register_write(&svm->vcpu, reg, val); + kvm_register_writel(&svm->vcpu, reg, val); } skip_emulated_instruction(&svm->vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9344ac6b4f99..6646edaa5123 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4275,6 +4275,26 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) (ss.selector & SEGMENT_RPL_MASK)); } +static bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, + unsigned int port, int size); +static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification; + unsigned short port; + int size; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + + return nested_vmx_check_io_bitmaps(vcpu, port, size); +} + /* * Check if guest state is valid. Returns true if valid, false if * not. @@ -5421,7 +5441,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, */ static void kvm_machine_check(void) { -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) +#if defined(CONFIG_X86_MCE) struct pt_regs regs = { .cs = 3, /* Fake ring 3 no matter what the guest ran on */ .flags = X86_EFLAGS_IF, @@ -6167,8 +6187,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) return 1; } else - return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, - NULL, 0) == EMULATE_DONE; + return emulate_instruction(vcpu, EMULTYPE_SKIP) == + EMULATE_DONE; } ret = handle_mmio_page_fault(vcpu, gpa, true); @@ -7261,8 +7281,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* _system ok, as nested_vmx_check_permission verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, &field_value, (is_long_mode(vcpu) ? 8 : 4), - &e)) + &e)) { kvm_inject_page_fault(vcpu, &e); + return 1; + } } nested_vmx_succeed(vcpu); @@ -7622,23 +7644,17 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +/* + * Return true if an IO instruction with the specified port and size should cause + * a VM-exit into L1. + */ +bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, + int size) { - unsigned long exit_qualification; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gpa_t bitmap, last_bitmap; - unsigned int port; - int size; u8 b; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - port = exit_qualification >> 16; - size = (exit_qualification & 7) + 1; - last_bitmap = (gpa_t)-1; b = -1; @@ -7828,7 +7844,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return true; } - switch (exit_reason) { + switch ((u16)exit_reason) { case EXIT_REASON_EXCEPTION_NMI: if (is_nmi(intr_info)) return false; @@ -8219,6 +8235,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_APIC_ACCESS && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; @@ -10738,11 +10755,71 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, to_vmx(vcpu)->nested.sync_shadow_vmcs = true; } +static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned short port; + bool intercept; + int size; + + if (info->intercept == x86_intercept_in || + info->intercept == x86_intercept_ins) { + port = info->src_val; + size = info->dst_bytes; + } else { + port = info->dst_val; + size = info->src_bytes; + } + + /* + * If the 'use IO bitmaps' VM-execution control is 0, IO instruction + * VM-exits depend on the 'unconditional IO exiting' VM-execution + * control. + * + * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. + */ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + intercept = nested_cpu_has(vmcs12, + CPU_BASED_UNCOND_IO_EXITING); + else + intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); + + return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; +} + static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return X86EMUL_CONTINUE; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + switch (info->intercept) { + /* + * RDPID causes #UD if disabled through secondary execution controls. + * Because it is marked as EmulateOnUD, we need to intercept it here. + */ + case x86_intercept_rdtscp: + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + ctxt->exception.vector = UD_VECTOR; + ctxt->exception.error_code_valid = false; + return X86EMUL_PROPAGATE_FAULT; + } + break; + + case x86_intercept_in: + case x86_intercept_ins: + case x86_intercept_out: + case x86_intercept_outs: + return vmx_check_intercept_io(vcpu, info); + + /* TODO: check more intercepts... */ + default: + break; + } + + return X86EMUL_UNHANDLEABLE; } static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b711cd261d7..910100257df9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -53,6 +53,7 @@ #include <linux/pvclock_gtod.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> +#include <linux/nospec.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -260,13 +261,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); int err; - if (((value ^ smsr->values[slot].curr) & mask) == 0) + value = (value & mask) | (smsr->values[slot].host & ~mask); + if (value == smsr->values[slot].curr) return 0; - smsr->values[slot].curr = value; err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); if (err) return 1; + smsr->values[slot].curr = value; if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -872,9 +874,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - vcpu->arch.db[dr] = val; + vcpu->arch.db[array_index_nospec(dr, size)] = val; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; @@ -911,9 +915,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr); int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - *val = vcpu->arch.db[dr]; + *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: /* fall through */ @@ -1023,10 +1029,15 @@ u64 kvm_get_arch_capabilities(void) * If TSX is disabled on the system, guests are also mitigated against * TAA and clear CPU buffer mitigation is not required for guests. */ - if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) && - (data & ARCH_CAP_TSX_CTRL_MSR)) + if (!boot_cpu_has(X86_FEATURE_RTM)) + data &= ~ARCH_CAP_TAA_NO; + else if (!boot_cpu_has_bug(X86_BUG_TAA)) + data |= ARCH_CAP_TAA_NO; + else if (data & ARCH_CAP_TSX_CTRL_MSR) data &= ~ARCH_CAP_MDS_NO; + /* KVM does not emulate MSR_IA32_TSX_CTRL. */ + data &= ~ARCH_CAP_TSX_CTRL_MSR; return data; } @@ -1983,7 +1994,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore @@ -2148,7 +2162,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); @@ -2158,6 +2172,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } @@ -2344,7 +2362,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; break; } @@ -2415,7 +2436,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_TSCDEADLINE: @@ -2924,7 +2945,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, unsigned bank_num = mcg_cap & 0xff, bank; r = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) goto out; @@ -3996,10 +4017,13 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof u.ps)) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit_out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); +set_pit_out: + mutex_unlock(&kvm->lock); break; } case KVM_GET_PIT2: { @@ -4019,10 +4043,13 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit2_out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); +set_pit2_out: + mutex_unlock(&kvm->lock); break; } case KVM_REINJECT_CONTROL: { @@ -4390,13 +4417,6 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; - /* - * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED - * is returned, but our callers are not ready for that and they blindly - * call kvm_inject_page_fault. Ensure that they at least do not leak - * uninitialized kernel stack memory into cr2 and error code. - */ - memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } @@ -4404,6 +4424,13 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + /* + * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED + * is returned, but our callers are not ready for that and they blindly + * call kvm_inject_page_fault. Ensure that they at least do not leak + * uninitialized kernel stack memory into cr2 and error code. + */ + memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -5868,14 +5895,12 @@ static void kvm_set_mmio_spte_mask(void) /* Set the present bit. */ mask |= 1ull; -#ifdef CONFIG_X86_64 /* * If reserved bit is not supported, clear the present bit to disable * mmio page fault. */ if (maxphyaddr == 52) mask &= ~1ull; -#endif kvm_mmu_set_mmio_spte_mask(mask); } @@ -5995,6 +6020,7 @@ void kvm_arch_exit(void) unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); + cancel_work_sync(&pvclock_gtod_work); #endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); @@ -6702,6 +6728,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[3], 3); set_debugreg(vcpu->arch.dr6, 6); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + } else if (unlikely(hw_breakpoint_active())) { + set_debugreg(0, 7); } kvm_x86_ops->run(vcpu); @@ -7481,7 +7509,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); - kvm_x86_ops->vcpu_free(vcpu); + kvm_arch_vcpu_free(vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -7967,6 +7995,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, { int i; + /* + * Clear out the previous array pointers for the KVM_MR_MOVE case. The + * old arrays will be freed by __kvm_set_memory_region() if installing + * the new memslot is successful. + */ + memset(&slot->arch, 0, sizeof(slot->arch)); + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { unsigned long ugfn; int lpages; @@ -8035,6 +8070,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + if (change == KVM_MR_MOVE) + return kvm_arch_create_memslot(kvm, memslot, + mem->memory_size >> PAGE_SHIFT); + return 0; } diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index 518532e6a3fa..8a3bc242c5e9 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -239,7 +239,7 @@ static void __wrmsr_safe_regs_on_cpu(void *info) rv->err = wrmsr_safe_regs(rv->regs); } -int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; @@ -252,7 +252,7 @@ int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) } EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); -int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index d258f59564e1..3b40c98bbbd4 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -208,7 +208,7 @@ sqrt_stage_2_finish: #ifdef PARANOID /* It should be possible to get here only if the arg is ffff....ffff */ - cmp $0xffffffff,FPU_fsqrt_arg_1 + cmpl $0xffffffff,FPU_fsqrt_arg_1 jnz sqrt_stage_2_error #endif /* PARANOID */ diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0f1c6fc3ddd8..47770ccab6d7 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -15,6 +15,7 @@ #include <linux/debugfs.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/sched.h> #include <linux/seq_file.h> #include <asm/pgtable.h> @@ -407,6 +408,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, } else note_page(m, &st, __pgprot(0), 1); + cond_resched(); start++; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0e498683295b..add7605bbd61 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -228,7 +228,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -void vmalloc_sync_all(void) +static void vmalloc_sync(void) { unsigned long address; @@ -255,6 +255,16 @@ void vmalloc_sync_all(void) } } +void vmalloc_sync_mappings(void) +{ + vmalloc_sync(); +} + +void vmalloc_sync_unmappings(void) +{ + vmalloc_sync(); +} + /* * 32-bit: * @@ -349,11 +359,23 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_all(void) +void vmalloc_sync_mappings(void) { + /* + * 64-bit mappings might allocate new p4d/pud pages + * that need to be propagated to all tasks' PGDs. + */ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); } +void vmalloc_sync_unmappings(void) +{ + /* + * Unmappings never allocate or free p4d/pud pages. + * No work is required here. + */ +} + /* * 64-bit: * diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 7d2542ad346a..6612d532e42e 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -95,7 +95,10 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, } VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - get_page(page); + if (unlikely(!try_get_page(page))) { + pte_unmap(ptep); + return 0; + } SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -132,6 +135,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, refs = 0; head = pmd_page(pmd); + if (WARN_ON_ONCE(page_ref_count(head) <= 0)) + return 0; page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); @@ -208,6 +213,8 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, refs = 0; head = pud_page(pud); + if (WARN_ON_ONCE(page_ref_count(head) <= 0)) + return 0; page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f00eb52c16a6..17eb564901ca 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -109,8 +109,6 @@ __ref void *alloc_low_pages(unsigned int num) } else { pfn = pgt_buf_end; pgt_buf_end += num; - printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", - pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); } for (i = 0; i < num; i++) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 97b6b0164dcb..a935039c20be 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1179,21 +1179,21 @@ int kern_addr_valid(unsigned long addr) return 0; pud = pud_offset(pgd, addr); - if (pud_none(*pud)) + if (!pud_present(*pud)) return 0; if (pud_large(*pud)) return pfn_valid(pud_pfn(*pud)); pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return 0; if (pmd_large(*pmd)) return pfn_valid(pmd_pfn(*pmd)); pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) + if (!pte_present(*pte)) return 0; return pfn_valid(pte_pfn(*pte)); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 0057a7accfb1..5448ad4d0703 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -385,7 +385,7 @@ static void enter_uniprocessor(void) int cpu; int err; - if (downed_cpus == NULL && + if (!cpumask_available(downed_cpus) && !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { pr_notice("Failed to allocate mask\n"); goto out; @@ -415,7 +415,7 @@ static void leave_uniprocessor(void) int cpu; int err; - if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) + if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0) return; pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 50f75768aadd..289518bb0e8d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -567,8 +567,8 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) fixmaps_set++; } -void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, - pgprot_t flags) +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) { __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } @@ -720,6 +720,8 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) } free_page((unsigned long)pmd_sv); + + pgtable_pmd_page_dtor(virt_to_page(pmd)); free_page((unsigned long)pmd); return 1; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index dd9a861fd526..4d3b0540279f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -15,8 +15,6 @@ #include <asm/nospec-branch.h> #include <linux/bpf.h> -int bpf_jit_enable __read_mostly; - /* * assembly code in arch/x86/net/bpf_jit.S */ @@ -145,6 +143,19 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_9)); } +/* + * is_ereg_8l() == true if BPF register 'reg' is mapped to access x86-64 + * lower 8-bit registers dil,sil,bpl,spl,r8b..r15b, which need extra byte + * of encoding. al,cl,dl,bl have simpler encoding. + */ +static bool is_ereg_8l(u32 reg) +{ + return is_ereg(reg) || + (1 << reg) & (BIT(BPF_REG_1) | + BIT(BPF_REG_2) | + BIT(BPF_REG_FP)); +} + /* add modifiers if 'reg' maps to x64 registers r8..r15 */ static u8 add_1mod(u8 byte, u32 reg) { @@ -194,7 +205,7 @@ struct jit_context { 32 /* space for rbx, r13, r14, r15 */ + \ 8 /* space for skb_copy_bits() buffer */) -#define PROLOGUE_SIZE 51 +#define PROLOGUE_SIZE 48 /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program @@ -230,11 +241,15 @@ static void emit_prologue(u8 **pprog) /* mov qword ptr [rbp-X],r15 */ EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); - /* clear A and X registers */ - EMIT2(0x31, 0xc0); /* xor eax, eax */ - EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ + /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls + * we need to reset the counter to 0. It's done in two instructions, + * resetting rax register to 0 (xor on eax gets 0 extended), and + * moving it to the counter location. + */ - /* clear tail_cnt: mov qword ptr [rbp-X], rax */ + /* xor eax, eax */ + EMIT2(0x31, 0xc0); + /* mov qword ptr [rbp-X], rax */ EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); BUILD_BUG_ON(cnt != PROLOGUE_SIZE); @@ -731,9 +746,8 @@ st: if (is_imm8(insn->off)) /* STX: *(u8*)(dst_reg + off) = src_reg */ case BPF_STX | BPF_MEM | BPF_B: /* emit 'mov byte ptr [rax + off], al' */ - if (is_ereg(dst_reg) || is_ereg(src_reg) || - /* have to add extra byte for x86 SIL, DIL regs */ - src_reg == BPF_REG_1 || src_reg == BPF_REG_2) + if (is_ereg(dst_reg) || is_ereg_8l(src_reg)) + /* Add extra byte for eregs or SIL,DIL,BPL in src_reg */ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); else EMIT1(0x88); @@ -841,7 +855,7 @@ xadd: if (is_imm8(insn->off)) } break; - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: emit_bpf_tail_call(&prog); break; @@ -1021,13 +1035,22 @@ common_load: ilen = prog - temp; if (ilen > BPF_MAX_INSN_SIZE) { - pr_err("bpf_jit_compile fatal insn size error\n"); + pr_err("bpf_jit: fatal insn size error\n"); return -EFAULT; } if (image) { - if (unlikely(proglen + ilen > oldproglen)) { - pr_err("bpf_jit_compile fatal error\n"); + /* + * When populating the image, assert that: + * + * i) We do not write beyond the allocated space, and + * ii) addrs[i] did not change from the prior run, in order + * to validate assumptions made for computing branch + * displacements. + */ + if (unlikely(proglen + ilen > oldproglen || + proglen + ilen != addrs[i])) { + pr_err("bpf_jit: fatal error\n"); return -EFAULT; } memcpy(image + proglen, temp, ilen); @@ -1039,11 +1062,7 @@ common_load: return proglen; } -void bpf_jit_compile(struct bpf_prog *prog) -{ -} - -void bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; int proglen, oldproglen = 0; @@ -1054,14 +1073,14 @@ void bpf_int_jit_compile(struct bpf_prog *prog) int i; if (!bpf_jit_enable) - return; + return prog; if (!prog || !prog->len) return; addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); if (!addrs) - return; + return prog; /* Before first pass, make a rough estimation of addrs[] * each bpf instruction is translated to less than 64 bytes @@ -1114,6 +1133,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) } out: kfree(addrs); + return prog; } void bpf_jit_free(struct bpf_prog *fp) diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 4e664bdb535a..cb31a4440e58 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr, int reliable) +static int backtrace_address(void *data, unsigned long addr, int reliable) { unsigned int *depth = data; if ((*depth)--) oprofile_add_trace(addr); + return 0; } static struct stacktrace_ops backtrace_ops = { diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f24bd7249536..a8bb10a78bc7 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -614,7 +614,7 @@ enum __force_cpu_type { static int force_cpu_type; -static int set_cpu_type(const char *str, struct kernel_param *kp) +static int set_cpu_type(const char *str, const struct kernel_param *kp) { if (!strcmp(str, "timer")) { force_cpu_type = timer; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 89f90549c6a8..c05b57e2edb0 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -542,6 +542,17 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); /* + * Device [1022:7914] + * When in D0, PME# doesn't get asserted when plugging USB 2.0 device. + */ +static void pci_fixup_amd_fch_xhci_pme(struct pci_dev *dev) +{ + dev_info(&dev->dev, "PME# does not work under D0, disabling it\n"); + dev->pme_support &= ~(PCI_PM_CAP_PME_D0 >> PCI_PM_CAP_PME_SHIFT); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7914, pci_fixup_amd_fch_xhci_pme); + +/* * Broadwell EP Home Agent BARs erroneously return non-zero values when read. * * See http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 4bc352fc08f1..105872617be0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -465,7 +465,6 @@ void __init efi_init(void) efi_char16_t *c16; char vendor[100] = "unknown"; int i = 0; - void *tmp; #ifdef CONFIG_X86_32 if (boot_params.efi_info.efi_systab_hi || @@ -490,14 +489,16 @@ void __init efi_init(void) /* * Show what we know for posterity */ - c16 = tmp = early_memremap(efi.systab->fw_vendor, 2); + c16 = early_memremap_ro(efi.systab->fw_vendor, + sizeof(vendor) * sizeof(efi_char16_t)); if (c16) { - for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = *c16++; + for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i) + vendor[i] = c16[i]; vendor[i] = '\0'; - } else + early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); + } else { pr_err("Could not map the firmware vendor!\n"); - early_memunmap(tmp, 2); + } pr_info("EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk index fd1ab80be0de..a4cf678cf5c8 100644 --- a/arch/x86/tools/chkobjdump.awk +++ b/arch/x86/tools/chkobjdump.awk @@ -10,6 +10,7 @@ BEGIN { /^GNU objdump/ { verstr = "" + gsub(/\(.*\)/, ""); for (i = 3; i <= NF; i++) if (match($(i), "^[0-9]")) { verstr = $(i); diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5b6c8486a0be..d1c3f82c7882 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -839,9 +839,11 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, case R_386_PC32: case R_386_PC16: case R_386_PC8: + case R_386_PLT32: /* - * NONE can be ignored and PC relative relocations don't - * need to be adjusted. + * NONE can be ignored and PC relative relocations don't need + * to be adjusted. Because sym must be defined, R_386_PLT32 can + * be treated the same way as R_386_PC32. */ break; @@ -882,9 +884,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, case R_386_PC32: case R_386_PC16: case R_386_PC8: + case R_386_PLT32: /* - * NONE can be ignored and PC relative relocations don't - * need to be adjusted. + * NONE can be ignored and PC relative relocations don't need + * to be adjusted. Because sym must be defined, R_386_PLT32 can + * be treated the same way as R_386_PC32. */ break; diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index e6552275320b..40ecacb2c54b 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -9,6 +9,7 @@ #include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <os.h> +#include <registers.h> long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) { @@ -32,7 +33,7 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) switch (code) { case ARCH_SET_FS: case ARCH_SET_GS: - ret = restore_registers(pid, ¤t->thread.regs.regs); + ret = restore_pid_registers(pid, ¤t->thread.regs.regs); if (ret) return ret; break; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 82fd84d5e1aa..28725a6ed5de 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -861,8 +861,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) preempt_enable(); } -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) +static unsigned xen_convert_trap_info(const struct desc_ptr *desc, + struct trap_info *traps, bool full) { unsigned in, out, count; @@ -872,17 +872,18 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, for (in = out = 0; in < count; in++) { gate_desc *entry = (gate_desc*)(desc->address) + in; - if (cvt_gate_to_trap(in, entry, &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out]) || full) out++; } - traps[out].address = 0; + + return out; } void xen_copy_trap_info(struct trap_info *traps) { const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - xen_convert_trap_info(desc, traps); + xen_convert_trap_info(desc, traps, true); } /* Load a new IDT into Xen. In principle this can be per-CPU, so we @@ -892,6 +893,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + unsigned out; trace_xen_cpu_load_idt(desc); @@ -899,7 +901,8 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - xen_convert_trap_info(desc, traps); + out = xen_convert_trap_info(desc, traps, false); + memset(&traps[out], 0, sizeof(traps[0])); xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) @@ -1240,10 +1243,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .iret = xen_iret, #ifdef CONFIG_X86_64 - .usergs_sysret32 = xen_sysret32, .usergs_sysret64 = xen_sysret64, -#else - .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index cab9f766bb06..af0ebe18248a 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -623,8 +623,8 @@ int xen_alloc_p2m_entry(unsigned long pfn) } /* Expanded the p2m? */ - if (pfn > xen_p2m_last_pfn) { - xen_p2m_last_pfn = pfn; + if (pfn >= xen_p2m_last_pfn) { + xen_p2m_last_pfn = ALIGN(pfn + 1, P2M_PER_PAGE); HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; } @@ -723,9 +723,12 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, for (i = 0; i < count; i++) { unsigned long mfn, pfn; + struct gnttab_unmap_grant_ref unmap[2]; + int rc; /* Do not add to override if the map failed. */ - if (map_ops[i].status) + if (map_ops[i].status != GNTST_okay || + (kmap_ops && kmap_ops[i].status != GNTST_okay)) continue; if (map_ops[i].flags & GNTMAP_contains_pte) { @@ -739,10 +742,46 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned"); - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { - ret = -ENOMEM; - goto out; + if (likely(set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) + continue; + + /* + * Signal an error for this slot. This in turn requires + * immediate unmapping. + */ + map_ops[i].status = GNTST_general_error; + unmap[0].host_addr = map_ops[i].host_addr, + unmap[0].handle = map_ops[i].handle; + map_ops[i].handle = ~0; + if (map_ops[i].flags & GNTMAP_device_map) + unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr; + else + unmap[0].dev_bus_addr = 0; + + if (kmap_ops) { + kmap_ops[i].status = GNTST_general_error; + unmap[1].host_addr = kmap_ops[i].host_addr, + unmap[1].handle = kmap_ops[i].handle; + kmap_ops[i].handle = ~0; + if (kmap_ops[i].flags & GNTMAP_device_map) + unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr; + else + unmap[1].dev_bus_addr = 0; } + + /* + * Pre-populate both status fields, to be recognizable in + * the log message below. + */ + unmap[0].status = 1; + unmap[1].status = 1; + + rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, 1 + !!kmap_ops); + if (rc || unmap[0].status != GNTST_okay || + unmap[1].status != GNTST_okay) + pr_err_once("gnttab unmap failed: rc=%d st0=%d st1=%d\n", + rc, unmap[0].status, unmap[1].status); } out: @@ -763,17 +802,15 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); unsigned long pfn = page_to_pfn(pages[i]); - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { + if (mfn != INVALID_P2M_ENTRY && (mfn & FOREIGN_FRAME_BIT)) + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + else ret = -EINVAL; - goto out; - } - - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } if (kunmap_ops) ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, - kunmap_ops, count); -out: + kunmap_ops, count) ?: ret; + return ret; } EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 29e50d1229bc..ee48506ca151 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -116,6 +116,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu) #endif cpu_bringup(); cpu_startup_entry(CPUHP_ONLINE); + prevent_tail_call_optimization(); } static void xen_smp_intr_free(unsigned int cpu) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 85872a08994a..e9fc0f7df0da 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -301,10 +301,20 @@ void xen_init_lock_cpu(int cpu) void xen_uninit_lock_cpu(int cpu) { + int irq; + if (!xen_pvspin) return; - unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); + /* + * When booting the kernel with 'mitigations=auto,nosmt', the secondary + * CPUs are not activated, and lock_kicker_irq is not initialized. + */ + irq = per_cpu(lock_kicker_irq, cpu); + if (irq == -1) + return; + + unbind_from_irqhandler(irq, NULL); per_cpu(lock_kicker_irq, cpu) = -1; kfree(per_cpu(irq_name, cpu)); per_cpu(irq_name, cpu) = NULL; diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index fd92a64d748e..feb6d40a0860 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -35,20 +35,6 @@ check_events: ret /* - * We can't use sysexit directly, because we're not running in ring0. - * But we can easily fake it up using iret. Assuming xen_sysexit is - * jumped to with a standard stack frame, we can just strip it back to - * a standard iret frame and use iret. - */ -ENTRY(xen_sysexit) - movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ - orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) - lea PT_EIP(%esp), %esp - - jmp xen_iret -ENDPROC(xen_sysexit) - -/* * This is run where a normal iret would be run, with the same stack setup: * 8: eflags * 4: cs diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1399423f3418..4140b070f2e9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -139,9 +139,6 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); -#ifdef CONFIG_X86_32 -__visible void xen_sysexit(void); -#endif __visible void xen_sysret32(void); __visible void xen_sysret64(void); __visible void xen_adjust_exception_frame(void); |