diff options
| author | Linux Build Service Account <lnxbuild@localhost> | 2018-08-14 08:35:07 -0700 |
|---|---|---|
| committer | Gerrit - the friendly Code Review server <code-review@localhost> | 2018-08-14 08:35:07 -0700 |
| commit | 601ae48764031462cfd3b2f62b2d712a3447978b (patch) | |
| tree | d472bdcd39955a6eda679fa9af1b64f49da6943c | |
| parent | 41155385a0a28dab926359f75a344a6183fd4195 (diff) | |
| parent | 499dddad34fa922f92339b98ff7f31b21be69893 (diff) | |
Merge "Merge android-4.4.145 (05670d3) into msm-4.4"
115 files changed, 2620 insertions, 482 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index ea6a043f5beb..50f95689ab38 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -276,6 +276,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/meltdown /sys/devices/system/cpu/vulnerabilities/spectre_v1 /sys/devices/system/cpu/vulnerabilities/spectre_v2 + /sys/devices/system/cpu/vulnerabilities/spec_store_bypass Date: January 2018 Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> Description: Information about CPU vulnerabilities diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0ea9ef13f758..40686dc931b9 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2530,6 +2530,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. allow data leaks with this option, which is equivalent to spectre_v2=off. + nospec_store_bypass_disable + [HW] Disable all mitigations for the Speculative Store Bypass vulnerability + noxsave [BUGS=X86] Disables x86 extended register state save and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state. @@ -3702,6 +3705,48 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Not specifying this option is equivalent to spectre_v2=auto. + spec_store_bypass_disable= + [HW] Control Speculative Store Bypass (SSB) Disable mitigation + (Speculative Store Bypass vulnerability) + + Certain CPUs are vulnerable to an exploit against a + a common industry wide performance optimization known + as "Speculative Store Bypass" in which recent stores + to the same memory location may not be observed by + later loads during speculative execution. The idea + is that such stores are unlikely and that they can + be detected prior to instruction retirement at the + end of a particular speculation execution window. + + In vulnerable processors, the speculatively forwarded + store can be used in a cache side channel attack, for + example to read memory to which the attacker does not + directly have access (e.g. inside sandboxed code). + + This parameter controls whether the Speculative Store + Bypass optimization is used. + + on - Unconditionally disable Speculative Store Bypass + off - Unconditionally enable Speculative Store Bypass + auto - Kernel detects whether the CPU model contains an + implementation of Speculative Store Bypass and + picks the most appropriate mitigation. If the + CPU is not vulnerable, "off" is selected. If the + CPU is vulnerable the default mitigation is + architecture and Kconfig dependent. See below. + prctl - Control Speculative Store Bypass per thread + via prctl. Speculative Store Bypass is enabled + for a process by default. The state of the control + is inherited on fork. + seccomp - Same as "prctl" above, but all seccomp threads + will disable SSB unless they explicitly opt out. + + Not specifying this option is equivalent to + spec_store_bypass_disable=auto. + + Default mitigations: + X86: If CONFIG_SECCOMP=y "seccomp", otherwise "prctl" + spia_io_base= [HW,MTD] spia_fio_base= spia_pedr= diff --git a/Documentation/spec_ctrl.txt b/Documentation/spec_ctrl.txt new file mode 100644 index 000000000000..32f3d55c54b7 --- /dev/null +++ b/Documentation/spec_ctrl.txt @@ -0,0 +1,94 @@ +=================== +Speculation Control +=================== + +Quite some CPUs have speculation-related misfeatures which are in +fact vulnerabilities causing data leaks in various forms even across +privilege domains. + +The kernel provides mitigation for such vulnerabilities in various +forms. Some of these mitigations are compile-time configurable and some +can be supplied on the kernel command line. + +There is also a class of mitigations which are very expensive, but they can +be restricted to a certain set of processes or tasks in controlled +environments. The mechanism to control these mitigations is via +:manpage:`prctl(2)`. + +There are two prctl options which are related to this: + + * PR_GET_SPECULATION_CTRL + + * PR_SET_SPECULATION_CTRL + +PR_GET_SPECULATION_CTRL +----------------------- + +PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature +which is selected with arg2 of prctl(2). The return value uses bits 0-3 with +the following meaning: + +==== ===================== =================================================== +Bit Define Description +==== ===================== =================================================== +0 PR_SPEC_PRCTL Mitigation can be controlled per task by + PR_SET_SPECULATION_CTRL. +1 PR_SPEC_ENABLE The speculation feature is enabled, mitigation is + disabled. +2 PR_SPEC_DISABLE The speculation feature is disabled, mitigation is + enabled. +3 PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A + subsequent prctl(..., PR_SPEC_ENABLE) will fail. +==== ===================== =================================================== + +If all bits are 0 the CPU is not affected by the speculation misfeature. + +If PR_SPEC_PRCTL is set, then the per-task control of the mitigation is +available. If not set, prctl(PR_SET_SPECULATION_CTRL) for the speculation +misfeature will fail. + +PR_SET_SPECULATION_CTRL +----------------------- + +PR_SET_SPECULATION_CTRL allows to control the speculation misfeature, which +is selected by arg2 of :manpage:`prctl(2)` per task. arg3 is used to hand +in the control value, i.e. either PR_SPEC_ENABLE or PR_SPEC_DISABLE or +PR_SPEC_FORCE_DISABLE. + +Common error codes +------------------ +======= ================================================================= +Value Meaning +======= ================================================================= +EINVAL The prctl is not implemented by the architecture or unused + prctl(2) arguments are not 0. + +ENODEV arg2 is selecting a not supported speculation misfeature. +======= ================================================================= + +PR_SET_SPECULATION_CTRL error codes +----------------------------------- +======= ================================================================= +Value Meaning +======= ================================================================= +0 Success + +ERANGE arg3 is incorrect, i.e. it's neither PR_SPEC_ENABLE nor + PR_SPEC_DISABLE nor PR_SPEC_FORCE_DISABLE. + +ENXIO Control of the selected speculation misfeature is not possible. + See PR_GET_SPECULATION_CTRL. + +EPERM Speculation was disabled with PR_SPEC_FORCE_DISABLE and caller + tried to enable it again. +======= ================================================================= + +Speculation misfeature controls +------------------------------- +- PR_SPEC_STORE_BYPASS: Speculative Store Bypass + + Invocations: + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 142 +SUBLEVEL = 145 EXTRAVERSION = NAME = Blurry Fish Butt @@ -631,6 +631,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,) KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation) KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context) +KBUILD_CFLAGS += $(call cc-disable-warning, attribute-alias) ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += $(call cc-option,-Oz,-Os) diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 429957f1c236..8f1145ed0046 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -102,7 +102,7 @@ typedef pte_t * pgtable_t; #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) /* Default Permissions for stack/heaps pages (Non Executable) */ -#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE) +#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define WANT_PAGE_VIRTUAL 1 diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index e5fec320f158..c07d7b0a4058 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -372,7 +372,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* Decode a PTE containing swap "identifier "into constituents */ #define __swp_type(pte_lookalike) (((pte_lookalike).val) & 0x1f) -#define __swp_offset(pte_lookalike) ((pte_lookalike).val << 13) +#define __swp_offset(pte_lookalike) ((pte_lookalike).val >> 13) /* NOPs, to keep generic kernel happy */ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 7fb59199c6bb..7665bd2f4871 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -251,7 +251,7 @@ extern int __put_user_8(void *, unsigned long long); ({ \ unsigned long __limit = current_thread_info()->addr_limit - 1; \ const typeof(*(p)) __user *__tmp_p = (p); \ - register const typeof(*(p)) __r2 asm("r2") = (x); \ + register typeof(*(p)) __r2 asm("r2") = (x); \ register const typeof(*(p)) __user *__p asm("r0") = __tmp_p; \ register unsigned long __l asm("r1") = __limit; \ register int __e asm("r0"); \ diff --git a/arch/mips/ath79/common.c b/arch/mips/ath79/common.c index 8ae4067a5eda..40ecb6e700cd 100644 --- a/arch/mips/ath79/common.c +++ b/arch/mips/ath79/common.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(ath79_ddr_ctrl_init); void ath79_ddr_wb_flush(u32 reg) { - void __iomem *flush_reg = ath79_ddr_wb_flush_base + reg; + void __iomem *flush_reg = ath79_ddr_wb_flush_base + (reg * 4); /* Flush the DDR write buffer. */ __raw_writel(0x1, flush_reg); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 054a22c0873b..9684a0d22d97 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -633,21 +633,48 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ALMASK; } +static DEFINE_PER_CPU(struct call_single_data, backtrace_csd); +static struct cpumask backtrace_csd_busy; + static void arch_dump_stack(void *info) { struct pt_regs *regs; + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + arch_spin_lock(&lock); regs = get_irq_regs(); if (regs) show_regs(regs); + else + dump_stack(); + arch_spin_unlock(&lock); - dump_stack(); + cpumask_clear_cpu(smp_processor_id(), &backtrace_csd_busy); } void arch_trigger_all_cpu_backtrace(bool include_self) { - smp_call_function(arch_dump_stack, NULL, 1); + struct call_single_data *csd; + int cpu; + + for_each_cpu(cpu, cpu_online_mask) { + /* + * If we previously sent an IPI to the target CPU & it hasn't + * cleared its bit in the busy cpumask then it didn't handle + * our previous IPI & it's not safe for us to reuse the + * call_single_data_t. + */ + if (cpumask_test_and_set_cpu(cpu, &backtrace_csd_busy)) { + pr_warn("Unable to send backtrace IPI to CPU%u - perhaps it hung?\n", + cpu); + continue; + } + + csd = &per_cpu(backtrace_csd, cpu); + csd->func = arch_dump_stack; + smp_call_function_single_async(cpu, csd); + } } int mips_get_process_fp_mode(struct task_struct *task) diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e23f4775b10b..da6997486709 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -345,6 +345,7 @@ static void __show_regs(const struct pt_regs *regs) void show_regs(struct pt_regs *regs) { __show_regs((struct pt_regs *)regs); + dump_stack(); } void show_registers(struct pt_regs *regs) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index d03bf0e28b8b..48c27c3fdfdb 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -79,24 +79,33 @@ ENTRY(entry_SYSENTER_compat) ASM_CLAC /* Clear AC after saving FLAGS */ pushq $__USER32_CS /* pt_regs->cs */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->ip = 0 (placeholder) */ + pushq $0 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + xorq %r8, %r8 /* nospec r8 */ + pushq $0 /* pt_regs->r9 = 0 */ + xorq %r9, %r9 /* nospec r9 */ + pushq $0 /* pt_regs->r10 = 0 */ + xorq %r10, %r10 /* nospec r10 */ + pushq $0 /* pt_regs->r11 = 0 */ + xorq %r11, %r11 /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ - pushq %r8 /* pt_regs->r12 = 0 */ - pushq %r8 /* pt_regs->r13 = 0 */ - pushq %r8 /* pt_regs->r14 = 0 */ - pushq %r8 /* pt_regs->r15 = 0 */ + xorl %ebp, %ebp /* nospec rbp */ + pushq $0 /* pt_regs->r12 = 0 */ + xorq %r12, %r12 /* nospec r12 */ + pushq $0 /* pt_regs->r13 = 0 */ + xorq %r13, %r13 /* nospec r13 */ + pushq $0 /* pt_regs->r14 = 0 */ + xorq %r14, %r14 /* nospec r14 */ + pushq $0 /* pt_regs->r15 = 0 */ + xorq %r15, %r15 /* nospec r15 */ cld /* @@ -185,17 +194,26 @@ ENTRY(entry_SYSCALL_compat) pushq %rdx /* pt_regs->dx */ pushq %rbp /* pt_regs->cx (stashed in bp) */ pushq $-ENOSYS /* pt_regs->ax */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + xorq %r8, %r8 /* nospec r8 */ + pushq $0 /* pt_regs->r9 = 0 */ + xorq %r9, %r9 /* nospec r9 */ + pushq $0 /* pt_regs->r10 = 0 */ + xorq %r10, %r10 /* nospec r10 */ + pushq $0 /* pt_regs->r11 = 0 */ + xorq %r11, %r11 /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ - pushq %r8 /* pt_regs->r12 = 0 */ - pushq %r8 /* pt_regs->r13 = 0 */ - pushq %r8 /* pt_regs->r14 = 0 */ - pushq %r8 /* pt_regs->r15 = 0 */ + xorl %ebp, %ebp /* nospec rbp */ + pushq $0 /* pt_regs->r12 = 0 */ + xorq %r12, %r12 /* nospec r12 */ + pushq $0 /* pt_regs->r13 = 0 */ + xorq %r13, %r13 /* nospec r13 */ + pushq $0 /* pt_regs->r14 = 0 */ + xorq %r14, %r14 /* nospec r14 */ + pushq $0 /* pt_regs->r15 = 0 */ + xorq %r15, %r15 /* nospec r15 */ /* * User mode is traced as though IRQs are on, and SYSENTER @@ -292,17 +310,26 @@ ENTRY(entry_INT80_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + xorq %r8, %r8 /* nospec r8 */ + pushq $0 /* pt_regs->r9 = 0 */ + xorq %r9, %r9 /* nospec r9 */ + pushq $0 /* pt_regs->r10 = 0 */ + xorq %r10, %r10 /* nospec r10 */ + pushq $0 /* pt_regs->r11 = 0 */ + xorq %r11, %r11 /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ pushq %rbp /* pt_regs->rbp */ + xorl %ebp, %ebp /* nospec rbp */ pushq %r12 /* pt_regs->r12 */ + xorq %r12, %r12 /* nospec r12 */ pushq %r13 /* pt_regs->r13 */ + xorq %r13, %r13 /* nospec r13 */ pushq %r14 /* pt_regs->r14 */ + xorq %r14, %r14 /* nospec r14 */ pushq %r15 /* pt_regs->r15 */ + xorq %r15, %r15 /* nospec r15 */ cld /* diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h index 20370c6db74b..3d1ec41ae09a 100644 --- a/arch/x86/include/asm/apm.h +++ b/arch/x86/include/asm/apm.h @@ -6,6 +6,8 @@ #ifndef _ASM_X86_MACH_DEFAULT_APM_H #define _ASM_X86_MACH_DEFAULT_APM_H +#include <asm/nospec-branch.h> + #ifdef APM_ZERO_SEGS # define APM_DO_ZERO_SEGS \ "pushl %%ds\n\t" \ @@ -31,6 +33,7 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ + firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -43,6 +46,7 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, "=S" (*esi) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); + firmware_restrict_branch_speculation_end(); } static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, @@ -55,6 +59,7 @@ static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ + firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -67,6 +72,7 @@ static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, "=S" (si) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); + firmware_restrict_branch_speculation_end(); return error; } diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 858f8c354cea..f3d4f1edc947 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -45,6 +45,65 @@ #define _ASM_SI __ASM_REG(si) #define _ASM_DI __ASM_REG(di) +#ifndef __x86_64__ +/* 32 bit */ + +#define _ASM_ARG1 _ASM_AX +#define _ASM_ARG2 _ASM_DX +#define _ASM_ARG3 _ASM_CX + +#define _ASM_ARG1L eax +#define _ASM_ARG2L edx +#define _ASM_ARG3L ecx + +#define _ASM_ARG1W ax +#define _ASM_ARG2W dx +#define _ASM_ARG3W cx + +#define _ASM_ARG1B al +#define _ASM_ARG2B dl +#define _ASM_ARG3B cl + +#else +/* 64 bit */ + +#define _ASM_ARG1 _ASM_DI +#define _ASM_ARG2 _ASM_SI +#define _ASM_ARG3 _ASM_DX +#define _ASM_ARG4 _ASM_CX +#define _ASM_ARG5 r8 +#define _ASM_ARG6 r9 + +#define _ASM_ARG1Q rdi +#define _ASM_ARG2Q rsi +#define _ASM_ARG3Q rdx +#define _ASM_ARG4Q rcx +#define _ASM_ARG5Q r8 +#define _ASM_ARG6Q r9 + +#define _ASM_ARG1L edi +#define _ASM_ARG2L esi +#define _ASM_ARG3L edx +#define _ASM_ARG4L ecx +#define _ASM_ARG5L r8d +#define _ASM_ARG6L r9d + +#define _ASM_ARG1W di +#define _ASM_ARG2W si +#define _ASM_ARG3W dx +#define _ASM_ARG4W cx +#define _ASM_ARG5W r8w +#define _ASM_ARG6W r9w + +#define _ASM_ARG1B dil +#define _ASM_ARG2B sil +#define _ASM_ARG3B dl +#define _ASM_ARG4B cl +#define _ASM_ARG5B r8b +#define _ASM_ARG6B r9b + +#endif + /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE(from,to) \ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index e3a6f66d288c..7f5dcb64cedb 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -40,7 +40,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, asm volatile ("cmp %1,%2; sbb %0,%0;" :"=r" (mask) - :"r"(size),"r" (index) + :"g"(size),"r" (index) :"cc"); return mask; } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index dd0089841a0f..d72c1db64679 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -28,6 +28,7 @@ enum cpuid_leafs CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, + CPUID_7_EDX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -78,8 +79,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 18)) + BUILD_BUG_ON_ZERO(NCAPINTS != 19)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -100,8 +102,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 18)) + BUILD_BUG_ON_ZERO(NCAPINTS != 19)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 205ce70c1d6c..f4b175db70f4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -12,7 +12,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 18 /* N 32-bit words worth of info */ +#define NCAPINTS 19 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -194,13 +194,28 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ + +#define X86_FEATURE_RETPOLINE ( 7*32+29) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* "" AMD Retpoline mitigation for Spectre variant 2 */ + +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ -#define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled*/ +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation */ + +#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ + + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ @@ -251,6 +266,10 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -285,6 +304,15 @@ #define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ #define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ + /* * BUG word(s) */ @@ -302,5 +330,6 @@ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 21c5ac15657b..1f8cca459c6c 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -59,6 +59,7 @@ #define DISABLED_MASK15 0 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) #define DISABLED_MASK17 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) +#define DISABLED_MASK18 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 08b1f2f6ea50..cfde088f8e95 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,7 @@ #include <asm/fpu/api.h> #include <asm/pgtable.h> +#include <asm/nospec-branch.h> /* * We map the EFI regions needed for runtime services non-contiguously, @@ -41,8 +42,10 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); ({ \ efi_status_t __s; \ kernel_fpu_begin(); \ + firmware_restrict_branch_speculation_start(); \ __s = ((efi_##f##_t __attribute__((regparm(0)))*) \ efi.systab->runtime->f)(args); \ + firmware_restrict_branch_speculation_end(); \ kernel_fpu_end(); \ __s; \ }) @@ -51,8 +54,10 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define __efi_call_virt(f, args...) \ ({ \ kernel_fpu_begin(); \ + firmware_restrict_branch_speculation_start(); \ ((efi_##f##_t __attribute__((regparm(0)))*) \ efi.systab->runtime->f)(args); \ + firmware_restrict_branch_speculation_end(); \ kernel_fpu_end(); \ }) @@ -73,7 +78,9 @@ extern u64 asmlinkage efi_call(void *fp, ...); efi_sync_low_kernel_mappings(); \ preempt_disable(); \ __kernel_fpu_begin(); \ + firmware_restrict_branch_speculation_start(); \ __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ + firmware_restrict_branch_speculation_end(); \ __kernel_fpu_end(); \ preempt_enable(); \ __s; \ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 6999f7d01a0d..e13ff5a14633 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -12,6 +12,7 @@ */ #define INTEL_FAM6_CORE_YONAH 0x0E + #define INTEL_FAM6_CORE2_MEROM 0x0F #define INTEL_FAM6_CORE2_MEROM_L 0x16 #define INTEL_FAM6_CORE2_PENRYN 0x17 @@ -20,6 +21,7 @@ #define INTEL_FAM6_NEHALEM 0x1E #define INTEL_FAM6_NEHALEM_EP 0x1A #define INTEL_FAM6_NEHALEM_EX 0x2E + #define INTEL_FAM6_WESTMERE 0x25 #define INTEL_FAM6_WESTMERE2 0x1F #define INTEL_FAM6_WESTMERE_EP 0x2C @@ -36,9 +38,9 @@ #define INTEL_FAM6_HASWELL_GT3E 0x46 #define INTEL_FAM6_BROADWELL_CORE 0x3D -#define INTEL_FAM6_BROADWELL_XEON_D 0x56 #define INTEL_FAM6_BROADWELL_GT3E 0x47 #define INTEL_FAM6_BROADWELL_X 0x4F +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 #define INTEL_FAM6_SKYLAKE_MOBILE 0x4E #define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E @@ -56,13 +58,15 @@ #define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ -#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ +#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ +#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ +#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index b77f5edb03b0..0056bc945cd1 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -8,7 +8,7 @@ * Interrupt control: */ -static inline unsigned long native_save_fl(void) +extern inline unsigned long native_save_fl(void) { unsigned long flags; diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 7680b76adafc..3359dfedc7ee 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,12 +3,18 @@ #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/atomic.h> /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. + */ + u64 ctx_id; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; #endif @@ -24,6 +30,11 @@ typedef struct { atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .ctx_id = 1, \ + } + void leave_mm(int cpu); #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 9bfc5fd77015..effc12767cbf 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -11,6 +11,9 @@ #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/mpx.h> + +extern atomic64_t last_mm_ctx_id; + #ifndef CONFIG_PARAVIRT static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -52,15 +55,15 @@ struct ldt_struct { /* * Used for LDT copy/destruction. */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); +int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context_ldt(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ -static inline int init_new_context(struct task_struct *tsk, - struct mm_struct *mm) +static inline int init_new_context_ldt(struct task_struct *tsk, + struct mm_struct *mm) { return 0; } -static inline void destroy_context(struct mm_struct *mm) {} +static inline void destroy_context_ldt(struct mm_struct *mm) {} #endif static inline void load_mm_ldt(struct mm_struct *mm) @@ -102,6 +105,18 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); } +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + init_new_context_ldt(tsk, mm); + return 0; +} +static inline void destroy_context(struct mm_struct *mm) +{ + destroy_context_ldt(mm); +} + extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b8911aecf035..caa00191e565 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -32,6 +32,15 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ +#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ +#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ +#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ +#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ +#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ + +#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ +#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ + #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd @@ -45,6 +54,16 @@ #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) #define MSR_MTRRcap 0x000000fe + +#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a +#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ +#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ +#define ARCH_CAP_SSB_NO (1 << 4) /* + * Not susceptible to Speculative Store Bypass + * attack, so no Speculative Store Bypass + * control required. + */ + #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e @@ -132,6 +151,7 @@ /* DEBUGCTLMSR bits (others vary by model): */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) @@ -308,6 +328,8 @@ #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 8b910416243c..b4c74c24c890 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -6,6 +6,7 @@ #include <asm/alternative.h> #include <asm/alternative-asm.h> #include <asm/cpufeatures.h> +#include <asm/msr-index.h> /* * Fill the CPU return stack buffer. @@ -171,6 +172,14 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* The Speculative Store Bypass disable variants */ +enum ssb_mitigation { + SPEC_STORE_BYPASS_NONE, + SPEC_STORE_BYPASS_DISABLE, + SPEC_STORE_BYPASS_PRCTL, + SPEC_STORE_BYPASS_SECCOMP, +}; + extern char __indirect_thunk_start[]; extern char __indirect_thunk_end[]; @@ -194,6 +203,51 @@ static inline void vmexit_fill_RSB(void) #endif } +static __always_inline +void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) +{ + asm volatile(ALTERNATIVE("", "wrmsr", %c[feature]) + : : "c" (msr), + "a" ((u32)val), + "d" ((u32)(val >> 32)), + [feature] "i" (feature) + : "memory"); +} + +static inline void indirect_branch_prediction_barrier(void) +{ + u64 val = PRED_CMD_IBPB; + + alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB); +} + +/* The Intel SPEC CTRL MSR base value cache */ +extern u64 x86_spec_ctrl_base; + +/* + * With retpoline, we must use IBRS to restrict branch prediction + * before calling into firmware. + * + * (Implemented as CPP macros due to header hell.) + */ +#define firmware_restrict_branch_speculation_start() \ +do { \ + u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \ + \ + preempt_disable(); \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ + X86_FEATURE_USE_IBRS_FW); \ +} while (0) + +#define firmware_restrict_branch_speculation_end() \ +do { \ + u64 val = x86_spec_ctrl_base; \ + \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ + X86_FEATURE_USE_IBRS_FW); \ + preempt_enable(); \ +} while (0) + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index fac9a5c0abe9..6847d85400a8 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -100,6 +100,7 @@ #define REQUIRED_MASK15 0 #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) +#define REQUIRED_MASK18 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h new file mode 100644 index 000000000000..ae7c2c5cd7f0 --- /dev/null +++ b/arch/x86/include/asm/spec-ctrl.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SPECCTRL_H_ +#define _ASM_X86_SPECCTRL_H_ + +#include <linux/thread_info.h> +#include <asm/nospec-branch.h> + +/* + * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR + * the guest has, while on VMEXIT we restore the host view. This + * would be easier if SPEC_CTRL were architecturally maskable or + * shadowable for guests but this is not (currently) the case. + * Takes the guest view of SPEC_CTRL MSR as a parameter and also + * the guest's version of VIRT_SPEC_CTRL, if emulated. + */ +extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest); + +/** + * x86_spec_ctrl_set_guest - Set speculation control registers for the guest + * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL + * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL + * (may get translated to MSR_AMD64_LS_CFG bits) + * + * Avoids writing to the MSR if the content/bits are the same + */ +static inline +void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +{ + x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true); +} + +/** + * x86_spec_ctrl_restore_host - Restore host speculation control registers + * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL + * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL + * (may get translated to MSR_AMD64_LS_CFG bits) + * + * Avoids writing to the MSR if the content/bits are the same + */ +static inline +void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +{ + x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false); +} + +/* AMD specific Speculative Store Bypass MSR data */ +extern u64 x86_amd_ls_cfg_base; +extern u64 x86_amd_ls_cfg_ssbd_mask; + +static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) +{ + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); +} + +static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) +{ + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); +} + +static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) +{ + return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; +} + +#ifdef CONFIG_SMP +extern void speculative_store_bypass_ht_init(void); +#else +static inline void speculative_store_bypass_ht_init(void) { } +#endif + +extern void speculative_store_bypass_update(unsigned long tif); + +static inline void speculative_store_bypass_update_current(void) +{ + speculative_store_bypass_update(current_thread_info()->flags); +} + +#endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 913468b17b9c..e8d453732823 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -92,6 +92,7 @@ struct thread_info { #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +#define TIF_SSBD 5 /* Reduced data speculation */ #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ @@ -114,8 +115,9 @@ struct thread_info { #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) @@ -147,7 +149,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) + (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index e2a89d2577fb..72cfe3e53af1 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -68,6 +68,8 @@ static inline void invpcid_flush_all_nonglobals(void) struct tlb_state { struct mm_struct *active_mm; int state; + /* last user mm's ctx id */ + u64 last_ctx_id; /* * Access to this CR4 shadow and to H/W CR4 is protected by @@ -109,6 +111,16 @@ static inline void cr4_clear_bits(unsigned long mask) } } +static inline void cr4_toggle_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + cr4 ^= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); +} + /* Read the CR4 shadow. */ static inline unsigned long cr4_read_shadow(void) { diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 616ebd22ef9a..5e59af41d40e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -53,6 +53,7 @@ obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-y += irqflags.o obj-y += process.o obj-y += fpu/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f4fb8f5b0be4..9f6151884249 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,6 +9,7 @@ #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> +#include <asm/spec-ctrl.h> #include <asm/smp.h> #include <asm/pci-direct.h> #include <asm/delay.h> @@ -519,6 +520,26 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); + + if (c->x86 >= 0x15 && c->x86 <= 0x17) { + unsigned int bit; + + switch (c->x86) { + case 0x15: bit = 54; break; + case 0x16: bit = 33; break; + case 0x17: bit = 10; break; + default: return; + } + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; + } + } } static void early_init_amd(struct cpuinfo_x86 *c) @@ -692,6 +713,17 @@ static void init_amd_bd(struct cpuinfo_x86 *c) } } +static void init_amd_zn(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_ZEN); + /* + * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects + * all up to and including B1. + */ + if (c->x86_model <= 1 && c->x86_mask <= 1) + set_cpu_cap(c, X86_FEATURE_CPB); +} + static void init_amd(struct cpuinfo_x86 *c) { u32 dummy; @@ -722,6 +754,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x17: init_amd_zn(c); break; } /* Enable workaround for FXSAVE leak */ @@ -791,8 +824,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); - /* AMD CPUs don't reset SS attributes on SYSRET */ - set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ + if (!cpu_has(c, X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2bbc74f8a4a8..12a8867071f3 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,8 +11,10 @@ #include <linux/utsname.h> #include <linux/cpu.h> #include <linux/module.h> +#include <linux/nospec.h> +#include <linux/prctl.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/cmdline.h> #include <asm/bugs.h> #include <asm/processor.h> @@ -26,6 +28,27 @@ #include <asm/intel-family.h> static void __init spectre_v2_select_mitigation(void); +static void __init ssb_select_mitigation(void); + +/* + * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any + * writes to SPEC_CTRL contain whatever reserved bits have been set. + */ +u64 x86_spec_ctrl_base; +EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* + * The vendor and possibly platform specific bits which can be modified in + * x86_spec_ctrl_base. + */ +static u64 x86_spec_ctrl_mask = SPEC_CTRL_IBRS; + +/* + * AMD specific MSR info for Speculative Store Bypass control. + * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). + */ +u64 x86_amd_ls_cfg_base; +u64 x86_amd_ls_cfg_ssbd_mask; void __init check_bugs(void) { @@ -36,9 +59,27 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* Allow STIBP in MSR_SPEC_CTRL if supported */ + if (boot_cpu_has(X86_FEATURE_STIBP)) + x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; + /* Select the proper spectre mitigation before patching alternatives */ spectre_v2_select_mitigation(); + /* + * Select proper mitigation for any exposure to the Speculative Store + * Bypass vulnerability. + */ + ssb_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -94,6 +135,73 @@ static const char *spectre_v2_strings[] = { static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +void +x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +{ + u64 msrval, guestval, hostval = x86_spec_ctrl_base; + struct thread_info *ti = current_thread_info(); + + /* Is MSR_SPEC_CTRL implemented ? */ + if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { + /* + * Restrict guest_spec_ctrl to supported values. Clear the + * modifiable bits in the host base value and or the + * modifiable bits from the guest value. + */ + guestval = hostval & ~x86_spec_ctrl_mask; + guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; + + /* SSBD controlled in MSR_SPEC_CTRL */ + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + hostval |= ssbd_tif_to_spec_ctrl(ti->flags); + + if (hostval != guestval) { + msrval = setguest ? guestval : hostval; + wrmsrl(MSR_IA32_SPEC_CTRL, msrval); + } + } + + /* + * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update + * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. + */ + if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && + !static_cpu_has(X86_FEATURE_VIRT_SSBD)) + return; + + /* + * If the host has SSBD mitigation enabled, force it in the host's + * virtual MSR value. If its not permanently enabled, evaluate + * current's TIF_SSBD thread flag. + */ + if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) + hostval = SPEC_CTRL_SSBD; + else + hostval = ssbd_tif_to_spec_ctrl(ti->flags); + + /* Sanitize the guest value */ + guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; + + if (hostval != guestval) { + unsigned long tif; + + tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : + ssbd_spec_ctrl_to_tif(hostval); + + speculative_store_bypass_update(tif); + } +} +EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); + +static void x86_amd_ssb_disable(void) +{ + u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; + + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + wrmsrl(MSR_AMD64_LS_CFG, msrval); +} #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -162,8 +270,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) return SPECTRE_V2_CMD_NONE; else { - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, - sizeof(arg)); + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); if (ret < 0) return SPECTRE_V2_CMD_AUTO; @@ -184,8 +291,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); + pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -255,14 +361,14 @@ static void __init spectre_v2_select_mitigation(void) goto retpoline_auto; break; } - pr_err("kernel not compiled with retpoline; no mitigation available!"); + pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); return; retpoline_auto: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { retpoline_amd: if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("LFENCE not serializing. Switching to generic retpoline\n"); + pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); goto retpoline_generic; } mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : @@ -280,7 +386,7 @@ retpoline_auto: pr_info("%s\n", spectre_v2_strings[mode]); /* - * If neither SMEP or KPTI are available, there is a risk of + * If neither SMEP nor PTI are available, there is a risk of * hitting userspace addresses in the RSB after a context switch * from a shallow call stack to a deeper one. To prevent this fill * the entire RSB, even when using IBRS. @@ -294,38 +400,309 @@ retpoline_auto: if ((!boot_cpu_has(X86_FEATURE_KAISER) && !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Filling RSB on context switch\n"); + pr_info("Spectre v2 mitigation: Filling RSB on context switch\n"); + } + + /* Initialize Indirect Branch Prediction Barrier if supported */ + if (boot_cpu_has(X86_FEATURE_IBPB)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); + } + + /* + * Retpoline means the kernel is safe because it has no indirect + * branches. But firmware isn't, so use IBRS to protect that. + */ + if (boot_cpu_has(X86_FEATURE_IBRS)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); + pr_info("Enabling Restricted Speculation for firmware calls\n"); + } +} + +#undef pr_fmt +#define pr_fmt(fmt) "Speculative Store Bypass: " fmt + +static enum ssb_mitigation ssb_mode = SPEC_STORE_BYPASS_NONE; + +/* The kernel command line selection */ +enum ssb_mitigation_cmd { + SPEC_STORE_BYPASS_CMD_NONE, + SPEC_STORE_BYPASS_CMD_AUTO, + SPEC_STORE_BYPASS_CMD_ON, + SPEC_STORE_BYPASS_CMD_PRCTL, + SPEC_STORE_BYPASS_CMD_SECCOMP, +}; + +static const char *ssb_strings[] = { + [SPEC_STORE_BYPASS_NONE] = "Vulnerable", + [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", + [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", + [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", +}; + +static const struct { + const char *option; + enum ssb_mitigation_cmd cmd; +} ssb_mitigation_options[] = { + { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ + { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ + { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ + { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ + { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ +}; + +static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +{ + enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + char arg[20]; + int ret, i; + + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + return SPEC_STORE_BYPASS_CMD_NONE; + } else { + ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", + arg, sizeof(arg)); + if (ret < 0) + return SPEC_STORE_BYPASS_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { + if (!match_option(arg, ret, ssb_mitigation_options[i].option)) + continue; + + cmd = ssb_mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(ssb_mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPEC_STORE_BYPASS_CMD_AUTO; + } + } + + return cmd; +} + +static enum ssb_mitigation __init __ssb_select_mitigation(void) +{ + enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; + enum ssb_mitigation_cmd cmd; + + if (!boot_cpu_has(X86_FEATURE_SSBD)) + return mode; + + cmd = ssb_parse_cmdline(); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && + (cmd == SPEC_STORE_BYPASS_CMD_NONE || + cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + return mode; + + switch (cmd) { + case SPEC_STORE_BYPASS_CMD_AUTO: + case SPEC_STORE_BYPASS_CMD_SECCOMP: + /* + * Choose prctl+seccomp as the default mode if seccomp is + * enabled. + */ + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPEC_STORE_BYPASS_SECCOMP; + else + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_ON: + mode = SPEC_STORE_BYPASS_DISABLE; + break; + case SPEC_STORE_BYPASS_CMD_PRCTL: + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_NONE: + break; + } + + /* + * We have three CPU feature flags that are in play here: + * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. + * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass + * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation + */ + if (mode == SPEC_STORE_BYPASS_DISABLE) { + setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); + /* + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses + * a completely different MSR and bit dependent on family. + */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + x86_spec_ctrl_base |= SPEC_CTRL_SSBD; + x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + break; + case X86_VENDOR_AMD: + x86_amd_ssb_disable(); + break; + } } + + return mode; +} + +static void ssb_select_mitigation(void) +{ + ssb_mode = __ssb_select_mitigation(); + + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt +#define pr_fmt(fmt) "Speculation prctl: " fmt + +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + bool update; + + if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && + ssb_mode != SPEC_STORE_BYPASS_SECCOMP) + return -ENXIO; + + switch (ctrl) { + case PR_SPEC_ENABLE: + /* If speculation is force disabled, enable is not allowed */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_clear_spec_ssb_disable(task); + update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); + break; + case PR_SPEC_DISABLE: + task_set_spec_ssb_disable(task); + update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + break; + case PR_SPEC_FORCE_DISABLE: + task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); + update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + break; + default: + return -ERANGE; + } + + /* + * If being set on non-current task, delay setting the CPU + * mitigation until it is next scheduled. + */ + if (task == current && update) + speculative_store_bypass_update_current(); + + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +#ifdef CONFIG_SECCOMP +void arch_seccomp_spec_mitigate(struct task_struct *task) +{ + if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) + ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); +} +#endif + +static int ssb_prctl_get(struct task_struct *task) +{ + switch (ssb_mode) { + case SPEC_STORE_BYPASS_DISABLE: + return PR_SPEC_DISABLE; + case SPEC_STORE_BYPASS_SECCOMP: + case SPEC_STORE_BYPASS_PRCTL: + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + default: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; + } +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_get(task); + default: + return -ENODEV; + } +} + +void x86_spec_ctrl_setup_ap(void) +{ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) + x86_amd_ssb_disable(); +} #ifdef CONFIG_SYSFS -ssize_t cpu_show_meltdown(struct device *dev, - struct device_attribute *attr, char *buf) + +static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, + char *buf, unsigned int bug) { - if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + if (!boot_cpu_has_bug(bug)) return sprintf(buf, "Not affected\n"); - if (boot_cpu_has(X86_FEATURE_KAISER)) - return sprintf(buf, "Mitigation: PTI\n"); + + switch (bug) { + case X86_BUG_CPU_MELTDOWN: + if (boot_cpu_has(X86_FEATURE_KAISER)) + return sprintf(buf, "Mitigation: PTI\n"); + + break; + + case X86_BUG_SPECTRE_V1: + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + + case X86_BUG_SPECTRE_V2: + return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + spectre_v2_module_string()); + + case X86_BUG_SPEC_STORE_BYPASS: + return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + + default: + break; + } + return sprintf(buf, "Vulnerable\n"); } -ssize_t cpu_show_spectre_v1(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) - return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); } -ssize_t cpu_show_spectre_v2(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - return sprintf(buf, "Not affected\n"); + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1); +} - return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], - spectre_v2_module_string()); +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 736e2843139b..3d21b28f9826 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -43,6 +43,8 @@ #include <asm/pat.h> #include <asm/microcode.h> #include <asm/microcode_intel.h> +#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -674,6 +676,40 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) } } +static void init_speculation_control(struct cpuinfo_x86 *c) +{ + /* + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, + * and they also have a different bit for STIBP support. Also, + * a hypervisor might have set the individual AMD bits even on + * Intel CPUs, for finer-grained selection of what's available. + */ + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_IBPB); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) + set_cpu_cap(c, X86_FEATURE_STIBP); + + if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD)) + set_cpu_cap(c, X86_FEATURE_SSBD); + + if (cpu_has(c, X86_FEATURE_AMD_IBRS)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_AMD_IBPB)) + set_cpu_cap(c, X86_FEATURE_IBPB); + + if (cpu_has(c, X86_FEATURE_AMD_STIBP)) { + set_cpu_cap(c, X86_FEATURE_STIBP); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -695,6 +731,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_ECX] = ecx; + c->x86_capability[CPUID_7_EDX] = edx; } /* Extended state features: level 0x0000000d */ @@ -765,6 +802,14 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); + init_speculation_control(c); + + /* + * Clear/Set all flags overridden by options, after probe. + * This needs to happen each time we re-probe, which may happen + * several times during CPU initialization. + */ + apply_forced_caps(c); } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -793,6 +838,75 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } +static const __initconst struct x86_cpu_id cpu_no_speculation[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_CENTAUR, 5 }, + { X86_VENDOR_INTEL, 5 }, + { X86_VENDOR_NSC, 5 }, + { X86_VENDOR_ANY, 4 }, + {} +}; + +static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { + { X86_VENDOR_AMD }, + {} +}; + +static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + { X86_VENDOR_CENTAUR, 5, }, + { X86_VENDOR_INTEL, 5, }, + { X86_VENDOR_NSC, 5, }, + { X86_VENDOR_AMD, 0x12, }, + { X86_VENDOR_AMD, 0x11, }, + { X86_VENDOR_AMD, 0x10, }, + { X86_VENDOR_AMD, 0xf, }, + { X86_VENDOR_ANY, 4, }, + {} +}; + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = 0; + + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + if (!x86_match_cpu(cpu_no_spec_store_bypass) && + !(ia32_cap & ARCH_CAP_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + + if (x86_match_cpu(cpu_no_speculation)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + + if (x86_match_cpu(cpu_no_meltdown)) + return; + + /* Rogue Data Cache Load? No! */ + if (ia32_cap & ARCH_CAP_RDCL_NO) + return; + + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); +} + /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -839,11 +953,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - if (c->x86_vendor != X86_VENDOR_AMD) - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - - setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + cpu_set_bug_bits(c); fpu__init_system(c); @@ -1132,6 +1242,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) enable_sep_cpu(); #endif mtrr_ap_init(); + x86_spec_ctrl_setup_ap(); } struct msr_range { diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 2584265d4745..3b19d82f7932 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -46,4 +46,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); + +extern void x86_spec_ctrl_setup_ap(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9299e3bdfad6..4dce22d3cb06 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@ #include <asm/msr.h> #include <asm/bugs.h> #include <asm/cpu.h> +#include <asm/intel-family.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -25,6 +26,62 @@ #include <asm/apic.h> #endif +/* + * Early microcode releases for the Spectre v2 mitigation were broken. + * Information taken from; + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf + * - https://kb.vmware.com/s/article/52345 + * - Microcode revisions observed in the wild + * - Release note from 20180108 microcode release + */ +struct sku_microcode { + u8 model; + u8 stepping; + u32 microcode; +}; +static const struct sku_microcode spectre_bad_microcodes[] = { + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 }, + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + /* Observed in the wild */ + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, +}; + +static bool bad_spectre_microcode(struct cpuinfo_x86 *c) +{ + int i; + + /* + * We know that the hypervisor lie to us on the microcode version so + * we may as well hope that it is running the correct version. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return false; + + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { + if (c->x86_model == spectre_bad_microcodes[i].model && + c->x86_mask == spectre_bad_microcodes[i].stepping) + return (c->microcode <= spectre_bad_microcodes[i].microcode); + } + return false; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -51,6 +108,22 @@ static void early_init_intel(struct cpuinfo_x86 *c) rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); } + /* Now if any of them are set, check the blacklist and clear the lot */ + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_INTEL_STIBP) || + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); + setup_clear_cpu_cap(X86_FEATURE_IBRS); + setup_clear_cpu_cap(X86_FEATURE_IBPB); + setup_clear_cpu_cap(X86_FEATURE_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SSBD); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD); + } + /* * Atom erratum AAE44/AAF40/AAG38/AAH41: * diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ddc9b8125918..7b8c8c838191 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2294,9 +2294,6 @@ static ssize_t store_int_with_restart(struct device *s, if (check_interval == old_check_interval) return ret; - if (check_interval < 1) - check_interval = 1; - mutex_lock(&mce_sysfs_mutex); mce_restart(); mutex_unlock(&mce_sysfs_mutex); diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S new file mode 100644 index 000000000000..3817eb748eb4 --- /dev/null +++ b/arch/x86/kernel/irqflags.S @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/asm.h> +#include <asm-generic/export.h> +#include <linux/linkage.h> + +/* + * unsigned long native_save_fl(void) + */ +ENTRY(native_save_fl) + pushf + pop %_ASM_AX + ret +ENDPROC(native_save_fl) +EXPORT_SYMBOL(native_save_fl) + +/* + * void native_restore_fl(unsigned long flags) + * %eax/%rdi: flags + */ +ENTRY(native_restore_fl) + push %_ASM_ARG1 + popf + ret +ENDPROC(native_restore_fl) +EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index bc429365b72a..8bc68cfc0d33 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -119,7 +119,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) * we do not have to muck with descriptors here, that is * done in switch_mm() as needed. */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) { struct ldt_struct *new_ldt; struct mm_struct *old_mm; @@ -160,7 +160,7 @@ out_unlock: * * 64bit: Don't touch the LDT register - we're already in the next thread. */ -void destroy_context(struct mm_struct *mm) +void destroy_context_ldt(struct mm_struct *mm) { free_ldt_struct(mm->context.ldt); mm->context.ldt = NULL; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2754dad26839..911ca11eb489 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -31,6 +31,7 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/vm86.h> +#include <asm/spec-ctrl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -116,11 +117,6 @@ void flush_thread(void) fpu__clear(&tsk->thread.fpu); } -static void hard_disable_TSC(void) -{ - cr4_set_bits(X86_CR4_TSD); -} - void disable_TSC(void) { preempt_disable(); @@ -129,15 +125,10 @@ void disable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_disable_TSC(); + cr4_set_bits(X86_CR4_TSD); preempt_enable(); } -static void hard_enable_TSC(void) -{ - cr4_clear_bits(X86_CR4_TSD); -} - static void enable_TSC(void) { preempt_disable(); @@ -146,7 +137,7 @@ static void enable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_enable_TSC(); + cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } @@ -174,48 +165,199 @@ int set_tsc_mode(unsigned int val) return 0; } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) +static inline void switch_to_bitmap(struct tss_struct *tss, + struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) { - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; - - if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ - test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) - debugctl |= DEBUGCTLMSR_BTF; - - update_debugctlmsr(debugctl); - } - - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); - } - - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. * Normally this is 128 bytes or less: */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + } else if (tifp & _TIF_IO_BITMAP) { /* * Clear any possible leftover bits: */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +} + +#ifdef CONFIG_SMP + +struct ssb_state { + struct ssb_state *shared_state; + raw_spinlock_t lock; + unsigned int disable_state; + unsigned long local_state; +}; + +#define LSTATE_SSB 0 + +static DEFINE_PER_CPU(struct ssb_state, ssb_state); + +void speculative_store_bypass_ht_init(void) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + st->local_state = 0; + + /* + * Shared state setup happens once on the first bringup + * of the CPU. It's not destroyed on CPU hotunplug. + */ + if (st->shared_state) + return; + + raw_spin_lock_init(&st->lock); + + /* + * Go over HT siblings and check whether one of them has set up the + * shared state pointer already. + */ + for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { + if (cpu == this_cpu) + continue; + + if (!per_cpu(ssb_state, cpu).shared_state) + continue; + + /* Link it to the state of the sibling: */ + st->shared_state = per_cpu(ssb_state, cpu).shared_state; + return; + } + + /* + * First HT sibling to come up on the core. Link shared state of + * the first HT sibling to itself. The siblings on the same core + * which come up later will see the shared state pointer and link + * themself to the state of this CPU. + */ + st->shared_state = st; +} + +/* + * Logic is: First HT sibling enables SSBD for both siblings in the core + * and last sibling to disable it, disables it for the whole core. This how + * MSR_SPEC_CTRL works in "hardware": + * + * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL + */ +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + u64 msr = x86_amd_ls_cfg_base; + + if (!static_cpu_has(X86_FEATURE_ZEN)) { + msr |= ssbd_tif_to_amd_ls_cfg(tifn); + wrmsrl(MSR_AMD64_LS_CFG, msr); + return; + } + + if (tifn & _TIF_SSBD) { + /* + * Since this can race with prctl(), block reentry on the + * same CPU. + */ + if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) + return; + + msr |= x86_amd_ls_cfg_ssbd_mask; + + raw_spin_lock(&st->shared_state->lock); + /* First sibling enables SSBD: */ + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + st->shared_state->disable_state++; + raw_spin_unlock(&st->shared_state->lock); + } else { + if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) + return; + + raw_spin_lock(&st->shared_state->lock); + st->shared_state->disable_state--; + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + raw_spin_unlock(&st->shared_state->lock); + } +} +#else +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); + + wrmsrl(MSR_AMD64_LS_CFG, msr); +} +#endif + +static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) +{ + /* + * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, + * so ssbd_tif_to_spec_ctrl() just works. + */ + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); +} + +static __always_inline void intel_set_ssb_state(unsigned long tifn) +{ + u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); + + wrmsrl(MSR_IA32_SPEC_CTRL, msr); +} + +static __always_inline void __speculative_store_bypass_update(unsigned long tifn) +{ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) + amd_set_ssb_virt_state(tifn); + else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + amd_set_core_ssb_state(tifn); + else + intel_set_ssb_state(tifn); +} + +void speculative_store_bypass_update(unsigned long tif) +{ + preempt_disable(); + __speculative_store_bypass_update(tif); + preempt_enable(); +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long tifp, tifn; + + prev = &prev_p->thread; + next = &next_p->thread; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + switch_to_bitmap(tss, prev, next, tifp, tifn); + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_SSBD) + __speculative_store_bypass_update(tifn); } /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 1f7aefc7b0b4..c017f1c71560 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -75,6 +75,7 @@ #include <asm/i8259.h> #include <asm/realmode.h> #include <asm/misc.h> +#include <asm/spec-ctrl.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -217,6 +218,8 @@ static void notrace start_secondary(void *unused) */ check_tsc_sync_target(); + speculative_store_bypass_ht_init(); + /* * Lock vector_lock and initialize the vectors on this cpu * before setting the cpu online. We must set it online with @@ -1209,6 +1212,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_mtrr_aps_delayed_init(); smp_quirk_init_udelay(); + + speculative_store_bypass_ht_init(); } void arch_enable_nonboot_cpus_begin(void) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 42654375b73f..df7827a981dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -37,7 +37,7 @@ #include <asm/desc.h> #include <asm/debugreg.h> #include <asm/kvm_para.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/virtext.h> #include "trace.h" diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 63c44a9bf6bb..18143886b186 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -48,7 +48,7 @@ #include <asm/kexec.h> #include <asm/apic.h> #include <asm/irq_remapping.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include "trace.h" #include "pmu.h" diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 7cad01af6dcd..6d683bbb3502 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -10,6 +10,7 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/nospec-branch.h> #include <asm/cache.h> #include <asm/apic.h> #include <asm/uv/uv.h> @@ -29,6 +30,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + struct flush_tlb_info { struct mm_struct *flush_mm; unsigned long flush_start; @@ -104,6 +107,36 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, unsigned cpu = smp_processor_id(); if (likely(prev != next)) { + u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); + + /* + * Avoid user/user BTB poisoning by flushing the branch + * predictor when switching between processes. This stops + * one process from doing Spectre-v2 attacks on another. + * + * As an optimization, flush indirect branches only when + * switching into processes that disable dumping. This + * protects high value processes like gpg, without having + * too high performance overhead. IBPB is *expensive*! + * + * This will not flush branches when switching into kernel + * threads. It will also not flush if we switch to idle + * thread and back to the same process. It will flush if we + * switch to a different non-dumpable process. + */ + if (tsk && tsk->mm && + tsk->mm->context.ctx_id != last_ctx_id && + get_dumpable(tsk->mm) != SUID_DUMP_USER) + indirect_branch_prediction_barrier(); + + /* + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. + */ + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); this_cpu_write(cpu_tlbstate.active_mm, next); cpumask_set_cpu(cpu, mm_cpumask(next)); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a0ac0f9c307f..f5a8cd96bae4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -40,6 +40,7 @@ #include <asm/fixmap.h> #include <asm/realmode.h> #include <asm/time.h> +#include <asm/nospec-branch.h> /* * We allocate runtime services regions bottom-up, starting from -4G, i.e. @@ -347,6 +348,7 @@ extern efi_status_t efi64_thunk(u32, ...); \ efi_sync_low_kernel_mappings(); \ local_irq_save(flags); \ + firmware_restrict_branch_speculation_start(); \ \ efi_scratch.prev_cr3 = read_cr3(); \ write_cr3((unsigned long)efi_scratch.efi_pgt); \ @@ -357,6 +359,7 @@ extern efi_status_t efi64_thunk(u32, ...); \ write_cr3(efi_scratch.prev_cr3); \ __flush_tlb_all(); \ + firmware_restrict_branch_speculation_end(); \ local_irq_restore(flags); \ \ __s; \ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cbef64b508e1..82fd84d5e1aa 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -460,6 +460,12 @@ static void __init xen_init_cpuid_mask(void) cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); } +static void __init xen_init_capabilities(void) +{ + if (xen_pv_domain()) + setup_force_cpu_cap(X86_FEATURE_XENPV); +} + static void xen_set_debugreg(int reg, unsigned long val) { HYPERVISOR_set_debugreg(reg, val); @@ -1587,6 +1593,7 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_init_irq_ops(); xen_init_cpuid_mask(); + xen_init_capabilities(); #ifdef CONFIG_X86_LOCAL_APIC /* @@ -1883,14 +1890,6 @@ bool xen_hvm_need_lapic(void) } EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); -static void xen_set_cpu_features(struct cpuinfo_x86 *c) -{ - if (xen_pv_domain()) { - clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); - set_cpu_cap(c, X86_FEATURE_XENPV); - } -} - const struct hypervisor_x86 x86_hyper_xen = { .name = "Xen", .detect = xen_platform, @@ -1898,7 +1897,6 @@ const struct hypervisor_x86 x86_hyper_xen = { .init_platform = xen_hvm_guest_init, #endif .x2apic_available = xen_x2apic_para_available, - .set_cpu_features = xen_set_cpu_features, }; EXPORT_SYMBOL(x86_hyper_xen); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3f4ebf0261f2..29e50d1229bc 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -28,6 +28,7 @@ #include <xen/interface/vcpu.h> #include <xen/interface/xenpmu.h> +#include <asm/spec-ctrl.h> #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> @@ -87,6 +88,8 @@ static void cpu_bringup(void) cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); + speculative_store_bypass_ht_init(); + xen_setup_cpu_clockevents(); notify_cpu_starting(cpu); @@ -357,6 +360,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + speculative_store_bypass_ht_init(); + xen_pmu_init(0); if (xen_smp_intr_init(0)) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 7f664c416faf..4ecd0de08557 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,11 +1,14 @@ #include <linux/types.h> #include <linux/tick.h> +#include <linux/percpu-defs.h> #include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/grant_table.h> #include <xen/events.h> +#include <asm/cpufeatures.h> +#include <asm/msr-index.h> #include <asm/xen/hypercall.h> #include <asm/xen/page.h> #include <asm/fixmap.h> @@ -68,6 +71,8 @@ static void xen_pv_post_suspend(int suspend_cancelled) xen_mm_unpin_all(); } +static DEFINE_PER_CPU(u64, spec_ctrl); + void xen_arch_pre_suspend(void) { if (xen_pv_domain()) @@ -84,6 +89,9 @@ void xen_arch_post_suspend(int cancelled) static void xen_vcpu_notify_restore(void *data) { + if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + wrmsrl(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl)); + /* Boot processor notified via generic timekeeping_resume() */ if (smp_processor_id() == 0) return; @@ -93,7 +101,15 @@ static void xen_vcpu_notify_restore(void *data) static void xen_vcpu_notify_suspend(void *data) { + u64 tmp; + tick_suspend_local(); + + if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { + rdmsrl(MSR_IA32_SPEC_CTRL, tmp); + this_cpu_write(spec_ctrl, tmp); + wrmsrl(MSR_IA32_SPEC_CTRL, 0); + } } void xen_arch_resume(void) diff --git a/block/blk-core.c b/block/blk-core.c index 3b2d557ff7c7..e5f8a1482eb7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -661,21 +661,17 @@ EXPORT_SYMBOL(blk_alloc_queue); int blk_queue_enter(struct request_queue *q, gfp_t gfp) { while (true) { - int ret; - if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; if (!gfpflags_allow_blocking(gfp)) return -EBUSY; - ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || - blk_queue_dying(q)); + wait_event(q->mq_freeze_wq, + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; - if (ret) - return ret; } } diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index c302f47f6323..94712e1c5cf9 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -1481,6 +1481,8 @@ static int zatm_ioctl(struct atm_dev *dev,unsigned int cmd,void __user *arg) return -EFAULT; if (pool < 0 || pool > ZATM_LAST_POOL) return -EINVAL; + pool = array_index_nospec(pool, + ZATM_LAST_POOL + 1); if (copy_from_user(&info, &((struct zatm_pool_req __user *) arg)->info, sizeof(info))) return -EFAULT; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index cb5c54e258eb..21044b1f29bc 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -693,14 +693,22 @@ ssize_t __weak cpu_show_spectre_v2(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); +static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, &dev_attr_spectre_v1.attr, &dev_attr_spectre_v2.attr, + &dev_attr_spec_store_bypass.attr, NULL }; diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 7e00d10d5a94..0ab3a8d36ad4 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -330,14 +330,6 @@ static int really_probe(struct device *dev, struct device_driver *drv) goto probe_failed; } - /* - * Ensure devices are listed in devices_kset in correct order - * It's important to move Dev to the end of devices_kset before - * calling .probe, because it could be recursive and parent Dev - * should always go first - */ - devices_kset_move_last(dev); - if (dev->bus->probe) { ret = dev->bus->probe(dev); if (ret) diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c index 8c41c6fcb9ee..acf83569f86f 100644 --- a/drivers/clk/tegra/clk-tegra30.c +++ b/drivers/clk/tegra/clk-tegra30.c @@ -333,11 +333,11 @@ static struct pdiv_map pllu_p[] = { }; static struct tegra_clk_pll_freq_table pll_u_freq_table[] = { - { 12000000, 480000000, 960, 12, 0, 12}, - { 13000000, 480000000, 960, 13, 0, 12}, - { 16800000, 480000000, 400, 7, 0, 5}, - { 19200000, 480000000, 200, 4, 0, 3}, - { 26000000, 480000000, 960, 26, 0, 12}, + { 12000000, 480000000, 960, 12, 2, 12 }, + { 13000000, 480000000, 960, 13, 2, 12 }, + { 16800000, 480000000, 400, 7, 2, 5 }, + { 19200000, 480000000, 200, 4, 2, 3 }, + { 26000000, 480000000, 960, 26, 2, 12 }, { 0, 0, 0, 0, 0, 0 }, }; @@ -1372,6 +1372,7 @@ static struct tegra_clk_init_table init_table[] __initdata = { {TEGRA30_CLK_GR2D, TEGRA30_CLK_PLL_C, 300000000, 0}, {TEGRA30_CLK_GR3D, TEGRA30_CLK_PLL_C, 300000000, 0}, {TEGRA30_CLK_GR3D2, TEGRA30_CLK_PLL_C, 300000000, 0}, + { TEGRA30_CLK_PLL_U, TEGRA30_CLK_CLK_MAX, 480000000, 0 }, {TEGRA30_CLK_CLK_MAX, TEGRA30_CLK_CLK_MAX, 0, 0}, /* This MUST be the last entry. */ }; diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c index 58a630e55d5d..78d0722feacb 100644 --- a/drivers/crypto/amcc/crypto4xx_core.c +++ b/drivers/crypto/amcc/crypto4xx_core.c @@ -207,7 +207,7 @@ static u32 crypto4xx_build_pdr(struct crypto4xx_device *dev) dev->pdr_pa); return -ENOMEM; } - memset(dev->pdr, 0, sizeof(struct ce_pd) * PPC4XX_NUM_PD); + memset(dev->pdr, 0, sizeof(struct ce_pd) * PPC4XX_NUM_PD); dev->shadow_sa_pool = dma_alloc_coherent(dev->core_dev->device, 256 * PPC4XX_NUM_PD, &dev->shadow_sa_pool_pa, @@ -240,13 +240,15 @@ static u32 crypto4xx_build_pdr(struct crypto4xx_device *dev) static void crypto4xx_destroy_pdr(struct crypto4xx_device *dev) { - if (dev->pdr != NULL) + if (dev->pdr) dma_free_coherent(dev->core_dev->device, sizeof(struct ce_pd) * PPC4XX_NUM_PD, dev->pdr, dev->pdr_pa); + if (dev->shadow_sa_pool) dma_free_coherent(dev->core_dev->device, 256 * PPC4XX_NUM_PD, dev->shadow_sa_pool, dev->shadow_sa_pool_pa); + if (dev->shadow_sr_pool) dma_free_coherent(dev->core_dev->device, sizeof(struct sa_state_record) * PPC4XX_NUM_PD, @@ -416,12 +418,12 @@ static u32 crypto4xx_build_sdr(struct crypto4xx_device *dev) static void crypto4xx_destroy_sdr(struct crypto4xx_device *dev) { - if (dev->sdr != NULL) + if (dev->sdr) dma_free_coherent(dev->core_dev->device, sizeof(struct ce_sd) * PPC4XX_NUM_SD, dev->sdr, dev->sdr_pa); - if (dev->scatter_buffer_va != NULL) + if (dev->scatter_buffer_va) dma_free_coherent(dev->core_dev->device, dev->scatter_buffer_size * PPC4XX_NUM_SD, dev->scatter_buffer_va, @@ -1029,12 +1031,10 @@ int crypto4xx_register_alg(struct crypto4xx_device *sec_dev, break; } - if (rc) { - list_del(&alg->entry); + if (rc) kfree(alg); - } else { + else list_add_tail(&alg->entry, &sec_dev->alg_list); - } } return 0; @@ -1188,7 +1188,7 @@ static int crypto4xx_probe(struct platform_device *ofdev) rc = crypto4xx_build_gdr(core_dev->dev); if (rc) - goto err_build_gdr; + goto err_build_pdr; rc = crypto4xx_build_sdr(core_dev->dev); if (rc) @@ -1230,12 +1230,11 @@ err_iomap: err_request_irq: irq_dispose_mapping(core_dev->irq); tasklet_kill(&core_dev->tasklet); - crypto4xx_destroy_sdr(core_dev->dev); err_build_sdr: + crypto4xx_destroy_sdr(core_dev->dev); crypto4xx_destroy_gdr(core_dev->dev); -err_build_gdr: - crypto4xx_destroy_pdr(core_dev->dev); err_build_pdr: + crypto4xx_destroy_pdr(core_dev->dev); kfree(core_dev->dev); err_alloc_dev: kfree(core_dev); diff --git a/drivers/mtd/ubi/attach.c b/drivers/mtd/ubi/attach.c index c1aaf0336cf2..5cde3ad1665e 100644 --- a/drivers/mtd/ubi/attach.c +++ b/drivers/mtd/ubi/attach.c @@ -175,6 +175,40 @@ static int add_corrupted(struct ubi_attach_info *ai, int pnum, int ec) } /** + * add_fastmap - add a Fastmap related physical eraseblock. + * @ai: attaching information + * @pnum: physical eraseblock number the VID header came from + * @vid_hdr: the volume identifier header + * @ec: erase counter of the physical eraseblock + * + * This function allocates a 'struct ubi_ainf_peb' object for a Fastamp + * physical eraseblock @pnum and adds it to the 'fastmap' list. + * Such blocks can be Fastmap super and data blocks from both the most + * recent Fastmap we're attaching from or from old Fastmaps which will + * be erased. + */ +static int add_fastmap(struct ubi_attach_info *ai, int pnum, + struct ubi_vid_hdr *vid_hdr, int ec) +{ + struct ubi_ainf_peb *aeb; + + aeb = kmem_cache_alloc(ai->aeb_slab_cache, GFP_KERNEL); + if (!aeb) + return -ENOMEM; + + aeb->pnum = pnum; + aeb->vol_id = be32_to_cpu(vidh->vol_id); + aeb->sqnum = be64_to_cpu(vidh->sqnum); + aeb->ec = ec; + list_add(&aeb->u.list, &ai->fastmap); + + dbg_bld("add to fastmap list: PEB %d, vol_id %d, sqnum: %llu", pnum, + aeb->vol_id, aeb->sqnum); + + return 0; +} + +/** * validate_vid_hdr - check volume identifier header. * @ubi: UBI device description object * @vid_hdr: the volume identifier header to check @@ -803,13 +837,26 @@ out_unlock: return err; } +static bool vol_ignored(int vol_id) +{ + switch (vol_id) { + case UBI_LAYOUT_VOLUME_ID: + return true; + } + +#ifdef CONFIG_MTD_UBI_FASTMAP + return ubi_is_fm_vol(vol_id); +#else + return false; +#endif +} + /** * scan_peb - scan and process UBI headers of a PEB. * @ubi: UBI device description object * @ai: attaching information * @pnum: the physical eraseblock number - * @vid: The volume ID of the found volume will be stored in this pointer - * @sqnum: The sqnum of the found volume will be stored in this pointer + * @fast: true if we're scanning for a Fastmap * * This function reads UBI headers of PEB @pnum, checks them, and adds * information about this PEB to the corresponding list or RB-tree in the @@ -817,9 +864,9 @@ out_unlock: * successfully handled and a negative error code in case of failure. */ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, - int pnum, int *vid, unsigned long long *sqnum) + int pnum, bool fast) { - long long uninitialized_var(ec); + long long ec; int err, bitflips = 0, vol_id = -1, ec_err = 0; dbg_bld("scan PEB %d", pnum); @@ -935,6 +982,20 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, */ ai->maybe_bad_peb_count += 1; case UBI_IO_BAD_HDR: + /* + * If we're facing a bad VID header we have to drop *all* + * Fastmap data structures we find. The most recent Fastmap + * could be bad and therefore there is a chance that we attach + * from an old one. On a fine MTD stack a PEB must not render + * bad all of a sudden, but the reality is different. + * So, let's be paranoid and help finding the root cause by + * falling back to scanning mode instead of attaching with a + * bad EBA table and cause data corruption which is hard to + * analyze. + */ + if (fast) + ai->force_full_scan = 1; + if (ec_err) /* * Both headers are corrupted. There is a possibility @@ -991,21 +1052,15 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, } vol_id = be32_to_cpu(vidh->vol_id); - if (vid) - *vid = vol_id; - if (sqnum) - *sqnum = be64_to_cpu(vidh->sqnum); - if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOLUME_ID) { + if (vol_id > UBI_MAX_VOLUMES && !vol_ignored(vol_id)) { int lnum = be32_to_cpu(vidh->lnum); /* Unsupported internal volume */ switch (vidh->compat) { case UBI_COMPAT_DELETE: - if (vol_id != UBI_FM_SB_VOLUME_ID - && vol_id != UBI_FM_DATA_VOLUME_ID) { - ubi_msg(ubi, "\"delete\" compatible internal volume %d:%d found, will remove it", - vol_id, lnum); - } + ubi_msg(ubi, "\"delete\" compatible internal volume %d:%d found, will remove it", + vol_id, lnum); + err = add_to_list(ai, pnum, vol_id, lnum, ec, 1, &ai->erase); if (err) @@ -1037,7 +1092,12 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, if (ec_err) ubi_warn(ubi, "valid VID header but corrupted EC header at PEB %d", pnum); - err = ubi_add_to_av(ubi, ai, pnum, ec, vidh, bitflips); + + if (ubi_is_fm_vol(vol_id)) + err = add_fastmap(ai, pnum, vidh, ec); + else + err = ubi_add_to_av(ubi, ai, pnum, ec, vidh, bitflips); + if (err) return err; @@ -1186,6 +1246,10 @@ static void destroy_ai(struct ubi_attach_info *ai) list_del(&aeb->u.list); kmem_cache_free(ai->aeb_slab_cache, aeb); } + list_for_each_entry_safe(aeb, aeb_tmp, &ai->fastmap, u.list) { + list_del(&aeb->u.list); + kmem_cache_free(ai->aeb_slab_cache, aeb); + } /* Destroy the volume RB-tree */ rb = ai->volumes.rb_node; @@ -1245,7 +1309,7 @@ static int scan_all(struct ubi_device *ubi, struct ubi_attach_info *ai, cond_resched(); dbg_gen("process PEB %d", pnum); - err = scan_peb(ubi, ai, pnum, NULL, NULL); + err = scan_peb(ubi, ai, pnum, false); if (err < 0) goto out_vidh; } @@ -1311,6 +1375,7 @@ static struct ubi_attach_info *alloc_ai(void) INIT_LIST_HEAD(&ai->free); INIT_LIST_HEAD(&ai->erase); INIT_LIST_HEAD(&ai->alien); + INIT_LIST_HEAD(&ai->fastmap); ai->volumes = RB_ROOT; ai->aeb_slab_cache = kmem_cache_create("ubi_aeb_slab_cache", sizeof(struct ubi_ainf_peb), @@ -1337,52 +1402,58 @@ static struct ubi_attach_info *alloc_ai(void) */ static int scan_fast(struct ubi_device *ubi, struct ubi_attach_info **ai) { - int err, pnum, fm_anchor = -1; - unsigned long long max_sqnum = 0; + int err, pnum; + struct ubi_attach_info *scan_ai; err = -ENOMEM; + scan_ai = alloc_ai(); + if (!scan_ai) + goto out; + ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); if (!ech) - goto out; + goto out_ai; vidh = ubi_zalloc_vid_hdr(ubi, GFP_KERNEL); if (!vidh) goto out_ech; for (pnum = 0; pnum < UBI_FM_MAX_START; pnum++) { - int vol_id = -1; - unsigned long long sqnum = -1; cond_resched(); dbg_gen("process PEB %d", pnum); - err = scan_peb(ubi, *ai, pnum, &vol_id, &sqnum); + err = scan_peb(ubi, scan_ai, pnum, true); if (err < 0) goto out_vidh; - - if (vol_id == UBI_FM_SB_VOLUME_ID && sqnum > max_sqnum) { - max_sqnum = sqnum; - fm_anchor = pnum; - } } ubi_free_vid_hdr(ubi, vidh); kfree(ech); - if (fm_anchor < 0) - return UBI_NO_FASTMAP; + if (scan_ai->force_full_scan) + err = UBI_NO_FASTMAP; + else + err = ubi_scan_fastmap(ubi, *ai, scan_ai); - destroy_ai(*ai); - *ai = alloc_ai(); - if (!*ai) - return -ENOMEM; + if (err) { + /* + * Didn't attach via fastmap, do a full scan but reuse what + * we've aready scanned. + */ + destroy_ai(*ai); + *ai = scan_ai; + } else + destroy_ai(scan_ai); - return ubi_scan_fastmap(ubi, *ai, fm_anchor); + return err; out_vidh: ubi_free_vid_hdr(ubi, vidh); out_ech: kfree(ech); +out_ai: + destroy_ai(scan_ai); out: return err; } diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index c4a25c858c07..03cf0553ec1b 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1178,6 +1178,8 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, struct ubi_volume *vol; uint32_t crc; + ubi_assert(rwsem_is_locked(&ubi->fm_eba_sem)); + vol_id = be32_to_cpu(vid_hdr->vol_id); lnum = be32_to_cpu(vid_hdr->lnum); @@ -1346,9 +1348,7 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, } ubi_assert(vol->eba_tbl[lnum] == from); - down_read(&ubi->fm_eba_sem); vol->eba_tbl[lnum] = to; - up_read(&ubi->fm_eba_sem); out_unlock_buf: mutex_unlock(&ubi->buf_mutex); diff --git a/drivers/mtd/ubi/fastmap-wl.c b/drivers/mtd/ubi/fastmap-wl.c index ed62f1efe6eb..69dd21679a30 100644 --- a/drivers/mtd/ubi/fastmap-wl.c +++ b/drivers/mtd/ubi/fastmap-wl.c @@ -262,6 +262,8 @@ static struct ubi_wl_entry *get_peb_for_wl(struct ubi_device *ubi) struct ubi_fm_pool *pool = &ubi->fm_wl_pool; int pnum; + ubi_assert(rwsem_is_locked(&ubi->fm_eba_sem)); + if (pool->used == pool->size) { /* We cannot update the fastmap here because this * function is called in atomic context. @@ -303,7 +305,7 @@ int ubi_ensure_anchor_pebs(struct ubi_device *ubi) wrk->anchor = 1; wrk->func = &wear_leveling_worker; - schedule_ubi_work(ubi, wrk); + __schedule_ubi_work(ubi, wrk); return 0; } @@ -344,7 +346,7 @@ int ubi_wl_put_fm_peb(struct ubi_device *ubi, struct ubi_wl_entry *fm_e, spin_unlock(&ubi->wl_lock); vol_id = lnum ? UBI_FM_DATA_VOLUME_ID : UBI_FM_SB_VOLUME_ID; - return schedule_erase(ubi, e, vol_id, lnum, torture); + return schedule_erase(ubi, e, vol_id, lnum, torture, true); } /** diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index bba7dd1b5ebf..72e89b352034 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -326,6 +326,7 @@ static int update_vol(struct ubi_device *ubi, struct ubi_attach_info *ai, aeb->pnum = new_aeb->pnum; aeb->copy_flag = new_vh->copy_flag; aeb->scrub = new_aeb->scrub; + aeb->sqnum = new_aeb->sqnum; kmem_cache_free(ai->aeb_slab_cache, new_aeb); /* new_aeb is older */ @@ -851,27 +852,57 @@ fail: } /** + * find_fm_anchor - find the most recent Fastmap superblock (anchor) + * @ai: UBI attach info to be filled + */ +static int find_fm_anchor(struct ubi_attach_info *ai) +{ + int ret = -1; + struct ubi_ainf_peb *aeb; + unsigned long long max_sqnum = 0; + + list_for_each_entry(aeb, &ai->fastmap, u.list) { + if (aeb->vol_id == UBI_FM_SB_VOLUME_ID && aeb->sqnum > max_sqnum) { + max_sqnum = aeb->sqnum; + ret = aeb->pnum; + } + } + + return ret; +} + +/** * ubi_scan_fastmap - scan the fastmap. * @ubi: UBI device object * @ai: UBI attach info to be filled - * @fm_anchor: The fastmap starts at this PEB + * @scan_ai: UBI attach info from the first 64 PEBs, + * used to find the most recent Fastmap data structure * * Returns 0 on success, UBI_NO_FASTMAP if no fastmap was found, * UBI_BAD_FASTMAP if one was found but is not usable. * < 0 indicates an internal error. */ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, - int fm_anchor) + struct ubi_attach_info *scan_ai) { struct ubi_fm_sb *fmsb, *fmsb2; struct ubi_vid_hdr *vh; struct ubi_ec_hdr *ech; struct ubi_fastmap_layout *fm; - int i, used_blocks, pnum, ret = 0; + struct ubi_ainf_peb *tmp_aeb, *aeb; + int i, used_blocks, pnum, fm_anchor, ret = 0; size_t fm_size; __be32 crc, tmp_crc; unsigned long long sqnum = 0; + fm_anchor = find_fm_anchor(scan_ai); + if (fm_anchor < 0) + return UBI_NO_FASTMAP; + + /* Move all (possible) fastmap blocks into our new attach structure. */ + list_for_each_entry_safe(aeb, tmp_aeb, &scan_ai->fastmap, u.list) + list_move_tail(&aeb->u.list, &ai->fastmap); + down_write(&ubi->fm_protect); memset(ubi->fm_buf, 0, ubi->fm_size); @@ -1484,22 +1515,30 @@ int ubi_update_fastmap(struct ubi_device *ubi) struct ubi_wl_entry *tmp_e; down_write(&ubi->fm_protect); + down_write(&ubi->work_sem); + down_write(&ubi->fm_eba_sem); ubi_refill_pools(ubi); if (ubi->ro_mode || ubi->fm_disabled) { + up_write(&ubi->fm_eba_sem); + up_write(&ubi->work_sem); up_write(&ubi->fm_protect); return 0; } ret = ubi_ensure_anchor_pebs(ubi); if (ret) { + up_write(&ubi->fm_eba_sem); + up_write(&ubi->work_sem); up_write(&ubi->fm_protect); return ret; } new_fm = kzalloc(sizeof(*new_fm), GFP_KERNEL); if (!new_fm) { + up_write(&ubi->fm_eba_sem); + up_write(&ubi->work_sem); up_write(&ubi->fm_protect); return -ENOMEM; } @@ -1608,16 +1647,14 @@ int ubi_update_fastmap(struct ubi_device *ubi) new_fm->e[0] = tmp_e; } - down_write(&ubi->work_sem); - down_write(&ubi->fm_eba_sem); ret = ubi_write_fastmap(ubi, new_fm); - up_write(&ubi->fm_eba_sem); - up_write(&ubi->work_sem); if (ret) goto err; out_unlock: + up_write(&ubi->fm_eba_sem); + up_write(&ubi->work_sem); up_write(&ubi->fm_protect); kfree(old_fm); return ret; diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index bdb885d9d3fc..61f039d3289e 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -705,6 +705,8 @@ struct ubi_ainf_volume { * @erase: list of physical eraseblocks which have to be erased * @alien: list of physical eraseblocks which should not be used by UBI (e.g., * those belonging to "preserve"-compatible internal volumes) + * @fastmap: list of physical eraseblocks which relate to fastmap (e.g., + * eraseblocks of the current and not yet erased old fastmap blocks) * @corr_peb_count: count of PEBs in the @corr list * @empty_peb_count: count of PEBs which are presumably empty (contain only * 0xFF bytes) @@ -715,6 +717,8 @@ struct ubi_ainf_volume { * @vols_found: number of volumes found * @highest_vol_id: highest volume ID * @is_empty: flag indicating whether the MTD device is empty or not + * @force_full_scan: flag indicating whether we need to do a full scan and drop + all existing Fastmap data structures * @min_ec: lowest erase counter value * @max_ec: highest erase counter value * @max_sqnum: highest sequence number value @@ -733,6 +737,7 @@ struct ubi_attach_info { struct list_head free; struct list_head erase; struct list_head alien; + struct list_head fastmap; int corr_peb_count; int empty_peb_count; int alien_peb_count; @@ -741,6 +746,7 @@ struct ubi_attach_info { int vols_found; int highest_vol_id; int is_empty; + int force_full_scan; int min_ec; int max_ec; unsigned long long max_sqnum; @@ -919,7 +925,7 @@ int ubi_compare_lebs(struct ubi_device *ubi, const struct ubi_ainf_peb *aeb, size_t ubi_calc_fm_size(struct ubi_device *ubi); int ubi_update_fastmap(struct ubi_device *ubi); int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, - int fm_anchor); + struct ubi_attach_info *scan_ai); #else static inline int ubi_update_fastmap(struct ubi_device *ubi) { return 0; } #endif @@ -1113,4 +1119,42 @@ static inline int idx2vol_id(const struct ubi_device *ubi, int idx) return idx; } +/** + * ubi_is_fm_vol - check whether a volume ID is a Fastmap volume. + * @vol_id: volume ID + */ +static inline bool ubi_is_fm_vol(int vol_id) +{ + switch (vol_id) { + case UBI_FM_SB_VOLUME_ID: + case UBI_FM_DATA_VOLUME_ID: + return true; + } + + return false; +} + +/** + * ubi_find_fm_block - check whether a PEB is part of the current Fastmap. + * @ubi: UBI device description object + * @pnum: physical eraseblock to look for + * + * This function returns a wear leveling object if @pnum relates to the current + * fastmap, @NULL otherwise. + */ +static inline struct ubi_wl_entry *ubi_find_fm_block(const struct ubi_device *ubi, + int pnum) +{ + int i; + + if (ubi->fm) { + for (i = 0; i < ubi->fm->used_blocks; i++) { + if (ubi->fm->e[i]->pnum == pnum) + return ubi->fm->e[i]; + } + } + + return NULL; +} + #endif /* !__UBI_UBI_H__ */ diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 5e65ab837e99..ffcb64bc2bf8 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -599,7 +599,7 @@ static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, * failure. */ static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, - int vol_id, int lnum, int torture) + int vol_id, int lnum, int torture, bool nested) { struct ubi_work *wl_wrk; @@ -618,7 +618,10 @@ static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, wl_wrk->lnum = lnum; wl_wrk->torture = torture; - schedule_ubi_work(ubi, wl_wrk); + if (nested) + __schedule_ubi_work(ubi, wl_wrk); + else + schedule_ubi_work(ubi, wl_wrk); return 0; } @@ -679,6 +682,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, if (!vid_hdr) return -ENOMEM; + down_read(&ubi->fm_eba_sem); mutex_lock(&ubi->move_mutex); spin_lock(&ubi->wl_lock); ubi_assert(!ubi->move_from && !ubi->move_to); @@ -919,6 +923,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, dbg_wl("done"); mutex_unlock(&ubi->move_mutex); + up_read(&ubi->fm_eba_sem); return 0; /* @@ -969,6 +974,7 @@ out_not_moved: } mutex_unlock(&ubi->move_mutex); + up_read(&ubi->fm_eba_sem); return 0; out_error: @@ -990,6 +996,7 @@ out_error: out_ro: ubi_ro_mode(ubi); mutex_unlock(&ubi->move_mutex); + up_read(&ubi->fm_eba_sem); ubi_assert(err != 0); return err < 0 ? err : -EIO; @@ -997,6 +1004,7 @@ out_cancel: ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); mutex_unlock(&ubi->move_mutex); + up_read(&ubi->fm_eba_sem); ubi_free_vid_hdr(ubi, vid_hdr); return 0; } @@ -1119,7 +1127,7 @@ static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk) int err1; /* Re-schedule the LEB for erasure */ - err1 = schedule_erase(ubi, e, vol_id, lnum, 0); + err1 = schedule_erase(ubi, e, vol_id, lnum, 0, false); if (err1) { wl_entry_destroy(ubi, e); err = err1; @@ -1315,7 +1323,7 @@ retry: } spin_unlock(&ubi->wl_lock); - err = schedule_erase(ubi, e, vol_id, lnum, torture); + err = schedule_erase(ubi, e, vol_id, lnum, torture, false); if (err) { spin_lock(&ubi->wl_lock); wl_tree_add(e, &ubi->used); @@ -1751,6 +1759,48 @@ static void shutdown_work(struct ubi_device *ubi) } /** + * erase_aeb - erase a PEB given in UBI attach info PEB + * @ubi: UBI device description object + * @aeb: UBI attach info PEB + * @sync: If true, erase synchronously. Otherwise schedule for erasure + */ +static int erase_aeb(struct ubi_device *ubi, struct ubi_ainf_peb *aeb, bool sync) +{ + struct ubi_wl_entry *e; + int err; + + e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->pnum = aeb->pnum; + e->ec = aeb->ec; + e->tagged_scrub_all = 0; + e->sqnum = aeb->sqnum; + ubi->lookuptbl[e->pnum] = e; + + if (sync) { + err = sync_erase(ubi, e, false); + if (err) + goto out_free; + + wl_tree_add(e, &ubi->free); + ubi->free_count++; + } else { + err = schedule_erase(ubi, e, aeb->vol_id, aeb->lnum, 0, false); + if (err) + goto out_free; + } + + return 0; + +out_free: + wl_entry_destroy(ubi, e); + + return err; +} + +/** * ubi_wl_init - initialize the WL sub-system using attaching information. * @ubi: UBI device description object * @ai: attaching information @@ -1787,19 +1837,9 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai) list_for_each_entry_safe(aeb, tmp, &ai->erase, u.list) { cond_resched(); - e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); - if (!e) - goto out_free; - - e->pnum = aeb->pnum; - e->ec = aeb->ec; - e->tagged_scrub_all = 0; - e->sqnum = aeb->sqnum; - ubi->lookuptbl[e->pnum] = e; - if (schedule_erase(ubi, e, aeb->vol_id, aeb->lnum, 0)) { - wl_entry_destroy(ubi, e); + err = erase_aeb(ubi, aeb, false); + if (err) goto out_free; - } found_pebs++; } @@ -1854,19 +1894,49 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai) } } - dbg_wl("found %i PEBs", found_pebs); + list_for_each_entry(aeb, &ai->fastmap, u.list) { + cond_resched(); - if (ubi->fm) { - ubi_assert(ubi->good_peb_count == - found_pebs + ubi->fm->used_blocks); + e = ubi_find_fm_block(ubi, aeb->pnum); - for (i = 0; i < ubi->fm->used_blocks; i++) { - e = ubi->fm->e[i]; + if (e) { + ubi_assert(!ubi->lookuptbl[e->pnum]); ubi->lookuptbl[e->pnum] = e; + } else { + bool sync = false; + + /* + * Usually old Fastmap PEBs are scheduled for erasure + * and we don't have to care about them but if we face + * an power cut before scheduling them we need to + * take care of them here. + */ + if (ubi->lookuptbl[aeb->pnum]) + continue; + + /* + * The fastmap update code might not find a free PEB for + * writing the fastmap anchor to and then reuses the + * current fastmap anchor PEB. When this PEB gets erased + * and a power cut happens before it is written again we + * must make sure that the fastmap attach code doesn't + * find any outdated fastmap anchors, hence we erase the + * outdated fastmap anchor PEBs synchronously here. + */ + if (aeb->vol_id == UBI_FM_SB_VOLUME_ID) + sync = true; + + err = erase_aeb(ubi, aeb, sync); + if (err) + goto out_free; } + + found_pebs++; } - else - ubi_assert(ubi->good_peb_count == found_pebs); + + dbg_wl("found %i PEBs", found_pebs); + + ubi_assert(ubi->good_peb_count == found_pebs); reserved_pebs = WL_RESERVED_PEBS; ubi_fastmap_init(ubi, &reserved_pebs); diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 51670b322409..700b98d9c250 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -2,6 +2,7 @@ * * Copyright (C) 2012 - 2014 Xilinx, Inc. * Copyright (C) 2009 PetaLogix. All rights reserved. + * Copyright (C) 2017 Sandvik Mining and Construction Oy * * Description: * This driver is developed for Axi CAN IP and for Zynq CANPS Controller. @@ -25,8 +26,10 @@ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/of.h> +#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/skbuff.h> +#include <linux/spinlock.h> #include <linux/string.h> #include <linux/types.h> #include <linux/can/dev.h> @@ -100,7 +103,7 @@ enum xcan_reg { #define XCAN_INTR_ALL (XCAN_IXR_TXOK_MASK | XCAN_IXR_BSOFF_MASK |\ XCAN_IXR_WKUP_MASK | XCAN_IXR_SLP_MASK | \ XCAN_IXR_RXNEMP_MASK | XCAN_IXR_ERROR_MASK | \ - XCAN_IXR_ARBLST_MASK | XCAN_IXR_RXOK_MASK) + XCAN_IXR_RXOFLW_MASK | XCAN_IXR_ARBLST_MASK) /* CAN register bit shift - XCAN_<REG>_<BIT>_SHIFT */ #define XCAN_BTR_SJW_SHIFT 7 /* Synchronous jump width */ @@ -117,6 +120,7 @@ enum xcan_reg { /** * struct xcan_priv - This definition define CAN driver instance * @can: CAN private data structure. + * @tx_lock: Lock for synchronizing TX interrupt handling * @tx_head: Tx CAN packets ready to send on the queue * @tx_tail: Tx CAN packets successfully sended on the queue * @tx_max: Maximum number packets the driver can send @@ -131,6 +135,7 @@ enum xcan_reg { */ struct xcan_priv { struct can_priv can; + spinlock_t tx_lock; unsigned int tx_head; unsigned int tx_tail; unsigned int tx_max; @@ -158,6 +163,11 @@ static const struct can_bittiming_const xcan_bittiming_const = { .brp_inc = 1, }; +#define XCAN_CAP_WATERMARK 0x0001 +struct xcan_devtype_data { + unsigned int caps; +}; + /** * xcan_write_reg_le - Write a value to the device register little endian * @priv: Driver private data structure @@ -237,6 +247,10 @@ static int set_reset_mode(struct net_device *ndev) usleep_range(500, 10000); } + /* reset clears FIFOs */ + priv->tx_head = 0; + priv->tx_tail = 0; + return 0; } @@ -391,6 +405,7 @@ static int xcan_start_xmit(struct sk_buff *skb, struct net_device *ndev) struct net_device_stats *stats = &ndev->stats; struct can_frame *cf = (struct can_frame *)skb->data; u32 id, dlc, data[2] = {0, 0}; + unsigned long flags; if (can_dropped_invalid_skb(ndev, skb)) return NETDEV_TX_OK; @@ -438,6 +453,9 @@ static int xcan_start_xmit(struct sk_buff *skb, struct net_device *ndev) data[1] = be32_to_cpup((__be32 *)(cf->data + 4)); can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max); + + spin_lock_irqsave(&priv->tx_lock, flags); + priv->tx_head++; /* Write the Frame to Xilinx CAN TX FIFO */ @@ -453,10 +471,16 @@ static int xcan_start_xmit(struct sk_buff *skb, struct net_device *ndev) stats->tx_bytes += cf->can_dlc; } + /* Clear TX-FIFO-empty interrupt for xcan_tx_interrupt() */ + if (priv->tx_max > 1) + priv->write_reg(priv, XCAN_ICR_OFFSET, XCAN_IXR_TXFEMP_MASK); + /* Check if the TX buffer is full */ if ((priv->tx_head - priv->tx_tail) == priv->tx_max) netif_stop_queue(ndev); + spin_unlock_irqrestore(&priv->tx_lock, flags); + return NETDEV_TX_OK; } @@ -529,6 +553,123 @@ static int xcan_rx(struct net_device *ndev) } /** + * xcan_current_error_state - Get current error state from HW + * @ndev: Pointer to net_device structure + * + * Checks the current CAN error state from the HW. Note that this + * only checks for ERROR_PASSIVE and ERROR_WARNING. + * + * Return: + * ERROR_PASSIVE or ERROR_WARNING if either is active, ERROR_ACTIVE + * otherwise. + */ +static enum can_state xcan_current_error_state(struct net_device *ndev) +{ + struct xcan_priv *priv = netdev_priv(ndev); + u32 status = priv->read_reg(priv, XCAN_SR_OFFSET); + + if ((status & XCAN_SR_ESTAT_MASK) == XCAN_SR_ESTAT_MASK) + return CAN_STATE_ERROR_PASSIVE; + else if (status & XCAN_SR_ERRWRN_MASK) + return CAN_STATE_ERROR_WARNING; + else + return CAN_STATE_ERROR_ACTIVE; +} + +/** + * xcan_set_error_state - Set new CAN error state + * @ndev: Pointer to net_device structure + * @new_state: The new CAN state to be set + * @cf: Error frame to be populated or NULL + * + * Set new CAN error state for the device, updating statistics and + * populating the error frame if given. + */ +static void xcan_set_error_state(struct net_device *ndev, + enum can_state new_state, + struct can_frame *cf) +{ + struct xcan_priv *priv = netdev_priv(ndev); + u32 ecr = priv->read_reg(priv, XCAN_ECR_OFFSET); + u32 txerr = ecr & XCAN_ECR_TEC_MASK; + u32 rxerr = (ecr & XCAN_ECR_REC_MASK) >> XCAN_ESR_REC_SHIFT; + + priv->can.state = new_state; + + if (cf) { + cf->can_id |= CAN_ERR_CRTL; + cf->data[6] = txerr; + cf->data[7] = rxerr; + } + + switch (new_state) { + case CAN_STATE_ERROR_PASSIVE: + priv->can.can_stats.error_passive++; + if (cf) + cf->data[1] = (rxerr > 127) ? + CAN_ERR_CRTL_RX_PASSIVE : + CAN_ERR_CRTL_TX_PASSIVE; + break; + case CAN_STATE_ERROR_WARNING: + priv->can.can_stats.error_warning++; + if (cf) + cf->data[1] |= (txerr > rxerr) ? + CAN_ERR_CRTL_TX_WARNING : + CAN_ERR_CRTL_RX_WARNING; + break; + case CAN_STATE_ERROR_ACTIVE: + if (cf) + cf->data[1] |= CAN_ERR_CRTL_ACTIVE; + break; + default: + /* non-ERROR states are handled elsewhere */ + WARN_ON(1); + break; + } +} + +/** + * xcan_update_error_state_after_rxtx - Update CAN error state after RX/TX + * @ndev: Pointer to net_device structure + * + * If the device is in a ERROR-WARNING or ERROR-PASSIVE state, check if + * the performed RX/TX has caused it to drop to a lesser state and set + * the interface state accordingly. + */ +static void xcan_update_error_state_after_rxtx(struct net_device *ndev) +{ + struct xcan_priv *priv = netdev_priv(ndev); + enum can_state old_state = priv->can.state; + enum can_state new_state; + + /* changing error state due to successful frame RX/TX can only + * occur from these states + */ + if (old_state != CAN_STATE_ERROR_WARNING && + old_state != CAN_STATE_ERROR_PASSIVE) + return; + + new_state = xcan_current_error_state(ndev); + + if (new_state != old_state) { + struct sk_buff *skb; + struct can_frame *cf; + + skb = alloc_can_err_skb(ndev, &cf); + + xcan_set_error_state(ndev, new_state, skb ? cf : NULL); + + if (skb) { + struct net_device_stats *stats = &ndev->stats; + + stats->rx_packets++; + stats->rx_bytes += cf->can_dlc; + netif_rx(skb); + } + } +} + +/** * xcan_err_interrupt - error frame Isr * @ndev: net_device pointer * @isr: interrupt status register value @@ -543,16 +684,12 @@ static void xcan_err_interrupt(struct net_device *ndev, u32 isr) struct net_device_stats *stats = &ndev->stats; struct can_frame *cf; struct sk_buff *skb; - u32 err_status, status, txerr = 0, rxerr = 0; + u32 err_status; skb = alloc_can_err_skb(ndev, &cf); err_status = priv->read_reg(priv, XCAN_ESR_OFFSET); priv->write_reg(priv, XCAN_ESR_OFFSET, err_status); - txerr = priv->read_reg(priv, XCAN_ECR_OFFSET) & XCAN_ECR_TEC_MASK; - rxerr = ((priv->read_reg(priv, XCAN_ECR_OFFSET) & - XCAN_ECR_REC_MASK) >> XCAN_ESR_REC_SHIFT); - status = priv->read_reg(priv, XCAN_SR_OFFSET); if (isr & XCAN_IXR_BSOFF_MASK) { priv->can.state = CAN_STATE_BUS_OFF; @@ -562,28 +699,10 @@ static void xcan_err_interrupt(struct net_device *ndev, u32 isr) can_bus_off(ndev); if (skb) cf->can_id |= CAN_ERR_BUSOFF; - } else if ((status & XCAN_SR_ESTAT_MASK) == XCAN_SR_ESTAT_MASK) { - priv->can.state = CAN_STATE_ERROR_PASSIVE; - priv->can.can_stats.error_passive++; - if (skb) { - cf->can_id |= CAN_ERR_CRTL; - cf->data[1] = (rxerr > 127) ? - CAN_ERR_CRTL_RX_PASSIVE : - CAN_ERR_CRTL_TX_PASSIVE; - cf->data[6] = txerr; - cf->data[7] = rxerr; - } - } else if (status & XCAN_SR_ERRWRN_MASK) { - priv->can.state = CAN_STATE_ERROR_WARNING; - priv->can.can_stats.error_warning++; - if (skb) { - cf->can_id |= CAN_ERR_CRTL; - cf->data[1] |= (txerr > rxerr) ? - CAN_ERR_CRTL_TX_WARNING : - CAN_ERR_CRTL_RX_WARNING; - cf->data[6] = txerr; - cf->data[7] = rxerr; - } + } else { + enum can_state new_state = xcan_current_error_state(ndev); + + xcan_set_error_state(ndev, new_state, skb ? cf : NULL); } /* Check for Arbitration lost interrupt */ @@ -599,7 +718,6 @@ static void xcan_err_interrupt(struct net_device *ndev, u32 isr) if (isr & XCAN_IXR_RXOFLW_MASK) { stats->rx_over_errors++; stats->rx_errors++; - priv->write_reg(priv, XCAN_SRR_OFFSET, XCAN_SRR_RESET_MASK); if (skb) { cf->can_id |= CAN_ERR_CRTL; cf->data[1] |= CAN_ERR_CRTL_RX_OVERFLOW; @@ -708,26 +826,20 @@ static int xcan_rx_poll(struct napi_struct *napi, int quota) isr = priv->read_reg(priv, XCAN_ISR_OFFSET); while ((isr & XCAN_IXR_RXNEMP_MASK) && (work_done < quota)) { - if (isr & XCAN_IXR_RXOK_MASK) { - priv->write_reg(priv, XCAN_ICR_OFFSET, - XCAN_IXR_RXOK_MASK); - work_done += xcan_rx(ndev); - } else { - priv->write_reg(priv, XCAN_ICR_OFFSET, - XCAN_IXR_RXNEMP_MASK); - break; - } + work_done += xcan_rx(ndev); priv->write_reg(priv, XCAN_ICR_OFFSET, XCAN_IXR_RXNEMP_MASK); isr = priv->read_reg(priv, XCAN_ISR_OFFSET); } - if (work_done) + if (work_done) { can_led_event(ndev, CAN_LED_EVENT_RX); + xcan_update_error_state_after_rxtx(ndev); + } if (work_done < quota) { napi_complete(napi); ier = priv->read_reg(priv, XCAN_IER_OFFSET); - ier |= (XCAN_IXR_RXOK_MASK | XCAN_IXR_RXNEMP_MASK); + ier |= XCAN_IXR_RXNEMP_MASK; priv->write_reg(priv, XCAN_IER_OFFSET, ier); } return work_done; @@ -742,18 +854,71 @@ static void xcan_tx_interrupt(struct net_device *ndev, u32 isr) { struct xcan_priv *priv = netdev_priv(ndev); struct net_device_stats *stats = &ndev->stats; + unsigned int frames_in_fifo; + int frames_sent = 1; /* TXOK => at least 1 frame was sent */ + unsigned long flags; + int retries = 0; + + /* Synchronize with xmit as we need to know the exact number + * of frames in the FIFO to stay in sync due to the TXFEMP + * handling. + * This also prevents a race between netif_wake_queue() and + * netif_stop_queue(). + */ + spin_lock_irqsave(&priv->tx_lock, flags); - while ((priv->tx_head - priv->tx_tail > 0) && - (isr & XCAN_IXR_TXOK_MASK)) { + frames_in_fifo = priv->tx_head - priv->tx_tail; + + if (WARN_ON_ONCE(frames_in_fifo == 0)) { + /* clear TXOK anyway to avoid getting back here */ priv->write_reg(priv, XCAN_ICR_OFFSET, XCAN_IXR_TXOK_MASK); + spin_unlock_irqrestore(&priv->tx_lock, flags); + return; + } + + /* Check if 2 frames were sent (TXOK only means that at least 1 + * frame was sent). + */ + if (frames_in_fifo > 1) { + WARN_ON(frames_in_fifo > priv->tx_max); + + /* Synchronize TXOK and isr so that after the loop: + * (1) isr variable is up-to-date at least up to TXOK clear + * time. This avoids us clearing a TXOK of a second frame + * but not noticing that the FIFO is now empty and thus + * marking only a single frame as sent. + * (2) No TXOK is left. Having one could mean leaving a + * stray TXOK as we might process the associated frame + * via TXFEMP handling as we read TXFEMP *after* TXOK + * clear to satisfy (1). + */ + while ((isr & XCAN_IXR_TXOK_MASK) && !WARN_ON(++retries == 100)) { + priv->write_reg(priv, XCAN_ICR_OFFSET, XCAN_IXR_TXOK_MASK); + isr = priv->read_reg(priv, XCAN_ISR_OFFSET); + } + + if (isr & XCAN_IXR_TXFEMP_MASK) { + /* nothing in FIFO anymore */ + frames_sent = frames_in_fifo; + } + } else { + /* single frame in fifo, just clear TXOK */ + priv->write_reg(priv, XCAN_ICR_OFFSET, XCAN_IXR_TXOK_MASK); + } + + while (frames_sent--) { can_get_echo_skb(ndev, priv->tx_tail % priv->tx_max); priv->tx_tail++; stats->tx_packets++; - isr = priv->read_reg(priv, XCAN_ISR_OFFSET); } - can_led_event(ndev, CAN_LED_EVENT_TX); + netif_wake_queue(ndev); + + spin_unlock_irqrestore(&priv->tx_lock, flags); + + can_led_event(ndev, CAN_LED_EVENT_TX); + xcan_update_error_state_after_rxtx(ndev); } /** @@ -772,6 +937,7 @@ static irqreturn_t xcan_interrupt(int irq, void *dev_id) struct net_device *ndev = (struct net_device *)dev_id; struct xcan_priv *priv = netdev_priv(ndev); u32 isr, ier; + u32 isr_errors; /* Get the interrupt status from Xilinx CAN */ isr = priv->read_reg(priv, XCAN_ISR_OFFSET); @@ -790,18 +956,17 @@ static irqreturn_t xcan_interrupt(int irq, void *dev_id) xcan_tx_interrupt(ndev, isr); /* Check for the type of error interrupt and Processing it */ - if (isr & (XCAN_IXR_ERROR_MASK | XCAN_IXR_RXOFLW_MASK | - XCAN_IXR_BSOFF_MASK | XCAN_IXR_ARBLST_MASK)) { - priv->write_reg(priv, XCAN_ICR_OFFSET, (XCAN_IXR_ERROR_MASK | - XCAN_IXR_RXOFLW_MASK | XCAN_IXR_BSOFF_MASK | - XCAN_IXR_ARBLST_MASK)); + isr_errors = isr & (XCAN_IXR_ERROR_MASK | XCAN_IXR_RXOFLW_MASK | + XCAN_IXR_BSOFF_MASK | XCAN_IXR_ARBLST_MASK); + if (isr_errors) { + priv->write_reg(priv, XCAN_ICR_OFFSET, isr_errors); xcan_err_interrupt(ndev, isr); } /* Check for the type of receive interrupt and Processing it */ - if (isr & (XCAN_IXR_RXNEMP_MASK | XCAN_IXR_RXOK_MASK)) { + if (isr & XCAN_IXR_RXNEMP_MASK) { ier = priv->read_reg(priv, XCAN_IER_OFFSET); - ier &= ~(XCAN_IXR_RXNEMP_MASK | XCAN_IXR_RXOK_MASK); + ier &= ~XCAN_IXR_RXNEMP_MASK; priv->write_reg(priv, XCAN_IER_OFFSET, ier); napi_schedule(&priv->napi); } @@ -1030,6 +1195,18 @@ static int __maybe_unused xcan_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(xcan_dev_pm_ops, xcan_suspend, xcan_resume); +static const struct xcan_devtype_data xcan_zynq_data = { + .caps = XCAN_CAP_WATERMARK, +}; + +/* Match table for OF platform binding */ +static const struct of_device_id xcan_of_match[] = { + { .compatible = "xlnx,zynq-can-1.0", .data = &xcan_zynq_data }, + { .compatible = "xlnx,axi-can-1.00.a", }, + { /* end of list */ }, +}; +MODULE_DEVICE_TABLE(of, xcan_of_match); + /** * xcan_probe - Platform registration call * @pdev: Handle to the platform device structure @@ -1044,8 +1221,10 @@ static int xcan_probe(struct platform_device *pdev) struct resource *res; /* IO mem resources */ struct net_device *ndev; struct xcan_priv *priv; + const struct of_device_id *of_id; + int caps = 0; void __iomem *addr; - int ret, rx_max, tx_max; + int ret, rx_max, tx_max, tx_fifo_depth; /* Get the virtual base address for the device */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -1055,7 +1234,8 @@ static int xcan_probe(struct platform_device *pdev) goto err; } - ret = of_property_read_u32(pdev->dev.of_node, "tx-fifo-depth", &tx_max); + ret = of_property_read_u32(pdev->dev.of_node, "tx-fifo-depth", + &tx_fifo_depth); if (ret < 0) goto err; @@ -1063,6 +1243,30 @@ static int xcan_probe(struct platform_device *pdev) if (ret < 0) goto err; + of_id = of_match_device(xcan_of_match, &pdev->dev); + if (of_id) { + const struct xcan_devtype_data *devtype_data = of_id->data; + + if (devtype_data) + caps = devtype_data->caps; + } + + /* There is no way to directly figure out how many frames have been + * sent when the TXOK interrupt is processed. If watermark programming + * is supported, we can have 2 frames in the FIFO and use TXFEMP + * to determine if 1 or 2 frames have been sent. + * Theoretically we should be able to use TXFWMEMP to determine up + * to 3 frames, but it seems that after putting a second frame in the + * FIFO, with watermark at 2 frames, it can happen that TXFWMEMP (less + * than 2 frames in FIFO) is set anyway with no TXOK (a frame was + * sent), which is not a sensible state - possibly TXFWMEMP is not + * completely synchronized with the rest of the bits? + */ + if (caps & XCAN_CAP_WATERMARK) + tx_max = min(tx_fifo_depth, 2); + else + tx_max = 1; + /* Create a CAN device instance */ ndev = alloc_candev(sizeof(struct xcan_priv), tx_max); if (!ndev) @@ -1077,6 +1281,7 @@ static int xcan_probe(struct platform_device *pdev) CAN_CTRLMODE_BERR_REPORTING; priv->reg_base = addr; priv->tx_max = tx_max; + spin_lock_init(&priv->tx_lock); /* Get IRQ for the device */ ndev->irq = platform_get_irq(pdev, 0); @@ -1144,9 +1349,9 @@ static int xcan_probe(struct platform_device *pdev) devm_can_led_init(ndev); clk_disable_unprepare(priv->bus_clk); clk_disable_unprepare(priv->can_clk); - netdev_dbg(ndev, "reg_base=0x%p irq=%d clock=%d, tx fifo depth:%d\n", + netdev_dbg(ndev, "reg_base=0x%p irq=%d clock=%d, tx fifo depth: actual %d, using %d\n", priv->reg_base, ndev->irq, priv->can.clock.freq, - priv->tx_max); + tx_fifo_depth, priv->tx_max); return 0; @@ -1182,14 +1387,6 @@ static int xcan_remove(struct platform_device *pdev) return 0; } -/* Match table for OF platform binding */ -static const struct of_device_id xcan_of_match[] = { - { .compatible = "xlnx,zynq-can-1.0", }, - { .compatible = "xlnx,axi-can-1.00.a", }, - { /* end of list */ }, -}; -MODULE_DEVICE_TABLE(of, xcan_of_match); - static struct platform_driver xcan_driver = { .probe = xcan_probe, .remove = xcan_remove, diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index 8b1929e9f698..ec5834087e4b 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1063,7 +1063,8 @@ static int bcm_enet_open(struct net_device *dev) val = enet_readl(priv, ENET_CTL_REG); val |= ENET_CTL_ENABLE_MASK; enet_writel(priv, val, ENET_CTL_REG); - enet_dma_writel(priv, ENETDMA_CFG_EN_MASK, ENETDMA_CFG_REG); + if (priv->dma_has_sram) + enet_dma_writel(priv, ENETDMA_CFG_EN_MASK, ENETDMA_CFG_REG); enet_dmac_writel(priv, priv->dma_chan_en_mask, ENETDMAC_CHANCFG, priv->rx_chan); @@ -1787,7 +1788,9 @@ static int bcm_enet_probe(struct platform_device *pdev) ret = PTR_ERR(priv->mac_clk); goto out; } - clk_prepare_enable(priv->mac_clk); + ret = clk_prepare_enable(priv->mac_clk); + if (ret) + goto out_put_clk_mac; /* initialize default and fetch platform data */ priv->rx_ring_size = BCMENET_DEF_RX_DESC; @@ -1819,9 +1822,11 @@ static int bcm_enet_probe(struct platform_device *pdev) if (IS_ERR(priv->phy_clk)) { ret = PTR_ERR(priv->phy_clk); priv->phy_clk = NULL; - goto out_put_clk_mac; + goto out_disable_clk_mac; } - clk_prepare_enable(priv->phy_clk); + ret = clk_prepare_enable(priv->phy_clk); + if (ret) + goto out_put_clk_phy; } /* do minimal hardware init to be able to probe mii bus */ @@ -1921,13 +1926,16 @@ out_free_mdio: out_uninit_hw: /* turn off mdc clock */ enet_writel(priv, 0, ENET_MIISC_REG); - if (priv->phy_clk) { + if (priv->phy_clk) clk_disable_unprepare(priv->phy_clk); + +out_put_clk_phy: + if (priv->phy_clk) clk_put(priv->phy_clk); - } -out_put_clk_mac: +out_disable_clk_mac: clk_disable_unprepare(priv->mac_clk); +out_put_clk_mac: clk_put(priv->mac_clk); out: free_netdev(dev); @@ -2772,7 +2780,9 @@ static int bcm_enetsw_probe(struct platform_device *pdev) ret = PTR_ERR(priv->mac_clk); goto out_unmap; } - clk_enable(priv->mac_clk); + ret = clk_prepare_enable(priv->mac_clk); + if (ret) + goto out_put_clk; priv->rx_chan = 0; priv->tx_chan = 1; @@ -2793,7 +2803,7 @@ static int bcm_enetsw_probe(struct platform_device *pdev) ret = register_netdev(dev); if (ret) - goto out_put_clk; + goto out_disable_clk; netif_carrier_off(dev); platform_set_drvdata(pdev, dev); @@ -2802,6 +2812,9 @@ static int bcm_enetsw_probe(struct platform_device *pdev) return 0; +out_disable_clk: + clk_disable_unprepare(priv->mac_clk); + out_put_clk: clk_put(priv->mac_clk); @@ -2833,6 +2846,9 @@ static int bcm_enetsw_remove(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); release_mem_region(res->start, resource_size(res)); + clk_disable_unprepare(priv->mac_clk); + clk_put(priv->mac_clk); + free_netdev(dev); return 0; } diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 1325825d5225..ce3a56bea6e6 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -9278,6 +9278,15 @@ static int tg3_chip_reset(struct tg3 *tp) tg3_restore_clk(tp); + /* Increase the core clock speed to fix tx timeout issue for 5762 + * with 100Mbps link speed. + */ + if (tg3_asic_rev(tp) == ASIC_REV_5762) { + val = tr32(TG3_CPMU_CLCK_ORIDE_ENABLE); + tw32(TG3_CPMU_CLCK_ORIDE_ENABLE, val | + TG3_CPMU_MAC_ORIDE_ENABLE); + } + /* Reprobe ASF enable state. */ tg3_flag_clear(tp, ENABLE_ASF); tp->phy_flags &= ~(TG3_PHYFLG_1G_ON_VAUX_OK | diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 8f7aa53a4c4b..7ae8374bff13 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -50,6 +50,7 @@ #include <linux/stringify.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/nospec.h> #include <asm/uaccess.h> #include "common.h" @@ -2256,6 +2257,7 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) if (t.qset_idx >= nqsets) return -EINVAL; + t.qset_idx = array_index_nospec(t.qset_idx, nqsets); q = &adapter->params.sge.qset[q1 + t.qset_idx]; t.rspq_size = q->rspq_size; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index e3080fbd9d00..7911dc3da98e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2891,7 +2891,7 @@ int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, u32 srqn = qp_get_srqn(qpc) & 0xffffff; int use_srq = (qp_get_srqn(qpc) >> 24) & 1; struct res_srq *srq; - int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff; + int local_qpn = vhcr->in_modifier & 0xffffff; err = adjust_qp_sched_queue(dev, slave, qpc, inbox); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 16bd585365a8..9ac14df0ca3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -643,6 +643,7 @@ static void cmd_work_handler(struct work_struct *work) struct semaphore *sem; unsigned long flags; int alloc_ret; + int cmd_mode; sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; down(sem); @@ -688,6 +689,7 @@ static void cmd_work_handler(struct work_struct *work) set_signature(ent, !cmd->checksum_disabled); dump_command(dev, ent, 1); ent->ts1 = ktime_get_ns(); + cmd_mode = cmd->mode; /* ring doorbell after the descriptor is valid */ mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); @@ -695,7 +697,7 @@ static void cmd_work_handler(struct work_struct *work) iowrite32be(1 << ent->idx, &dev->iseg->cmd_dbell); mmiowb(); /* if not in polling don't use ent after this point */ - if (cmd->mode == CMD_MODE_POLLING) { + if (cmd_mode == CMD_MODE_POLLING) { poll_timeout(ent); /* make sure we read the descriptor after ownership is SW */ rmb(); @@ -1126,7 +1128,7 @@ static ssize_t outlen_write(struct file *filp, const char __user *buf, { struct mlx5_core_dev *dev = filp->private_data; struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; - char outlen_str[8]; + char outlen_str[8] = {0}; int outlen; void *ptr; int err; @@ -1141,8 +1143,6 @@ static ssize_t outlen_write(struct file *filp, const char __user *buf, if (copy_from_user(outlen_str, buf, count)) return -EFAULT; - outlen_str[7] = 0; - err = sscanf(outlen_str, "%d", &outlen); if (err < 0) return err; diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 174f7341c5c3..688b6da5a9bb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -22,6 +22,7 @@ #include <linux/etherdevice.h> #include <linux/vmalloc.h> #include <linux/qed/qed_if.h> +#include <linux/crash_dump.h> #include "qed.h" #include "qed_sp.h" @@ -634,6 +635,14 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, /* We want a minimum of one slowpath and one fastpath vector per hwfn */ cdev->int_params.in.min_msix_cnt = cdev->num_hwfns * 2; + if (is_kdump_kernel()) { + DP_INFO(cdev, + "Kdump kernel: Limit the max number of requested MSI-X vectors to %hd\n", + cdev->int_params.in.min_msix_cnt); + cdev->int_params.in.num_vectors = + cdev->int_params.in.min_msix_cnt; + } + rc = qed_set_int_mode(cdev, false); if (rc) { DP_ERR(cdev, "qed_slowpath_setup_int ERR\n"); diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index e23a642357e7..eb4d8df49399 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -60,8 +60,7 @@ #include <linux/sungem_phy.h> #include "sungem.h" -/* Stripping FCS is causing problems, disabled for now */ -#undef STRIP_FCS +#define STRIP_FCS #define DEFAULT_MSG (NETIF_MSG_DRV | \ NETIF_MSG_PROBE | \ @@ -435,7 +434,7 @@ static int gem_rxmac_reset(struct gem *gp) writel(desc_dma & 0xffffffff, gp->regs + RXDMA_DBLOW); writel(RX_RING_SIZE - 4, gp->regs + RXDMA_KICK); val = (RXDMA_CFG_BASE | (RX_OFFSET << 10) | - ((14 / 2) << 13) | RXDMA_CFG_FTHRESH_128); + (ETH_HLEN << 13) | RXDMA_CFG_FTHRESH_128); writel(val, gp->regs + RXDMA_CFG); if (readl(gp->regs + GREG_BIFCFG) & GREG_BIFCFG_M66EN) writel(((5 & RXDMA_BLANK_IPKTS) | @@ -760,7 +759,6 @@ static int gem_rx(struct gem *gp, int work_to_do) struct net_device *dev = gp->dev; int entry, drops, work_done = 0; u32 done; - __sum16 csum; if (netif_msg_rx_status(gp)) printk(KERN_DEBUG "%s: rx interrupt, done: %d, rx_new: %d\n", @@ -855,9 +853,13 @@ static int gem_rx(struct gem *gp, int work_to_do) skb = copy_skb; } - csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 0xffff); - skb->csum = csum_unfold(csum); - skb->ip_summed = CHECKSUM_COMPLETE; + if (likely(dev->features & NETIF_F_RXCSUM)) { + __sum16 csum; + + csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 0xffff); + skb->csum = csum_unfold(csum); + skb->ip_summed = CHECKSUM_COMPLETE; + } skb->protocol = eth_type_trans(skb, gp->dev); napi_gro_receive(&gp->napi, skb); @@ -1755,7 +1757,7 @@ static void gem_init_dma(struct gem *gp) writel(0, gp->regs + TXDMA_KICK); val = (RXDMA_CFG_BASE | (RX_OFFSET << 10) | - ((14 / 2) << 13) | RXDMA_CFG_FTHRESH_128); + (ETH_HLEN << 13) | RXDMA_CFG_FTHRESH_128); writel(val, gp->regs + RXDMA_CFG); writel(desc_dma >> 32, gp->regs + RXDMA_DBHI); @@ -2973,8 +2975,8 @@ static int gem_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, dev); /* We can do scatter/gather and HW checksum */ - dev->hw_features = NETIF_F_SG | NETIF_F_HW_CSUM; - dev->features |= dev->hw_features | NETIF_F_RXCSUM; + dev->hw_features = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; + dev->features = dev->hw_features; if (pci_using_dac) dev->features |= NETIF_F_HIGHDMA; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 8179727d3423..1f2f25a71d18 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1265,11 +1265,8 @@ static int gen10g_resume(struct phy_device *phydev) static int __set_phy_supported(struct phy_device *phydev, u32 max_speed) { - /* The default values for phydev->supported are provided by the PHY - * driver "features" member, we want to reset to sane defaults first - * before supporting higher speeds. - */ - phydev->supported &= PHY_DEFAULT_FEATURES; + phydev->supported &= ~(PHY_1000BT_FEATURES | PHY_100BT_FEATURES | + PHY_10BT_FEATURES); switch (max_speed) { default: diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 2991d7155540..2bb336cb13ee 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3139,7 +3139,8 @@ static int rtl8152_close(struct net_device *netdev) #ifdef CONFIG_PM_SLEEP unregister_pm_notifier(&tp->pm_notifier); #endif - napi_disable(&tp->napi); + if (!test_bit(RTL8152_UNPLUG, &tp->flags)) + napi_disable(&tp->napi); clear_bit(WORK_ENABLE, &tp->flags); usb_kill_urb(tp->intr_urb); cancel_delayed_work_sync(&tp->schedule); diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index 8b537a5a4b01..8006f0972ad1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -135,7 +135,6 @@ found_alt: firmware->size); rtlpriv->rtlhal.wowlan_fwsize = firmware->size; } - rtlpriv->rtlhal.fwsize = firmware->size; release_firmware(firmware); } diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index da7bae991552..d877ff124365 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -88,6 +88,7 @@ int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin, case PTP_PF_PHYSYNC: if (chan != 0) return -EINVAL; + break; default: return -EINVAL; } diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 7ed30d0b5273..a501f3ba6a3f 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1771,6 +1771,9 @@ static const struct usb_device_id acm_ids[] = { { USB_DEVICE(0x09d8, 0x0320), /* Elatec GmbH TWN3 */ .driver_info = NO_UNION_NORMAL, /* has misplaced union descriptor */ }, + { USB_DEVICE(0x0ca6, 0xa050), /* Castles VEGA3000 */ + .driver_info = NO_UNION_NORMAL, /* reports zero length descriptor */ + }, { USB_DEVICE(0x2912, 0x0001), /* ATOL FPrint */ .driver_info = CLEAR_HALT_CONDITIONS, diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 4ecbb36e8252..037869b6b493 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1134,10 +1134,14 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) if (!udev || udev->state == USB_STATE_NOTATTACHED) { /* Tell hub_wq to disconnect the device or - * check for a new connection + * check for a new connection or over current condition. + * Based on USB2.0 Spec Section 11.12.5, + * C_PORT_OVER_CURRENT could be set while + * PORT_OVER_CURRENT is not. So check for any of them. */ if (udev || (portstatus & USB_PORT_STAT_CONNECTION) || - (portstatus & USB_PORT_STAT_OVERCURRENT)) + (portstatus & USB_PORT_STAT_OVERCURRENT) || + (portchange & USB_PORT_STAT_C_OVERCURRENT)) set_bit(port1, hub->change_bits); } else if (portstatus & USB_PORT_STAT_ENABLE) { diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 5ccc09888345..f97193011bed 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3565,7 +3565,7 @@ static int ffs_func_setup(struct usb_function *f, ffs_log("exit"); - return USB_GADGET_DELAYED_STATUS; + return creq->wLength == 0 ? USB_GADGET_DELAYED_STATUS : 0; } static void ffs_func_suspend(struct usb_function *f) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 50a1b0a34617..dfc02a6c6d55 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -909,6 +909,41 @@ static void xhci_disable_port_wake_on_bits(struct xhci_hcd *xhci) spin_unlock_irqrestore(&xhci->lock, flags); } +static bool xhci_pending_portevent(struct xhci_hcd *xhci) +{ + __le32 __iomem **port_array; + int port_index; + u32 status; + u32 portsc; + + status = readl(&xhci->op_regs->status); + if (status & STS_EINT) + return true; + /* + * Checking STS_EINT is not enough as there is a lag between a change + * bit being set and the Port Status Change Event that it generated + * being written to the Event Ring. See note in xhci 1.1 section 4.19.2. + */ + + port_index = xhci->num_usb2_ports; + port_array = xhci->usb2_ports; + while (port_index--) { + portsc = readl(port_array[port_index]); + if (portsc & PORT_CHANGE_MASK || + (portsc & PORT_PLS_MASK) == XDEV_RESUME) + return true; + } + port_index = xhci->num_usb3_ports; + port_array = xhci->usb3_ports; + while (port_index--) { + portsc = readl(port_array[port_index]); + if (portsc & PORT_CHANGE_MASK || + (portsc & PORT_PLS_MASK) == XDEV_RESUME) + return true; + } + return false; +} + /* * Stop HC (not bus-specific) * @@ -1006,7 +1041,7 @@ EXPORT_SYMBOL_GPL(xhci_suspend); */ int xhci_resume(struct xhci_hcd *xhci, bool hibernated) { - u32 command, temp = 0, status; + u32 command, temp = 0; struct usb_hcd *hcd = xhci_to_hcd(xhci); struct usb_hcd *secondary_hcd; int retval = 0; @@ -1128,8 +1163,7 @@ int xhci_resume(struct xhci_hcd *xhci, bool hibernated) done: if (retval == 0) { /* Resume root hubs only when have pending events. */ - status = readl(&xhci->op_regs->status); - if (status & STS_EINT) { + if (xhci_pending_portevent(xhci)) { usb_hcd_resume_root_hub(xhci->shared_hcd); usb_hcd_resume_root_hub(hcd); } diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index c665806983be..72beaa47c15b 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -382,6 +382,10 @@ struct xhci_op_regs { #define PORT_PLC (1 << 22) /* port configure error change - port failed to configure its link partner */ #define PORT_CEC (1 << 23) +#define PORT_CHANGE_MASK (PORT_CSC | PORT_PEC | PORT_WRC | PORT_OCC | \ + PORT_RC | PORT_PLC | PORT_CEC) + + /* Cold Attach Status - xHC can set this bit to report device attached during * Sx state. Warm port reset should be perfomed to clear this bit and move port * to connected state. diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 44a5a8777053..645b2197930e 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -955,7 +955,8 @@ err_used: if (ubufs) vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: - sockfd_put(sock); + if (sock) + sockfd_put(sock); err_vq: mutex_unlock(&vq->mutex); err: diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a6c21fba6e9f..1ac142ad7d1b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -613,13 +613,21 @@ static void fat_set_state(struct super_block *sb, brelse(bh); } +static void fat_reset_iocharset(struct fat_mount_options *opts) +{ + if (opts->iocharset != fat_default_iocharset) { + /* Note: opts->iocharset can be NULL here */ + kfree(opts->iocharset); + opts->iocharset = fat_default_iocharset; + } +} + static void delayed_free(struct rcu_head *p) { struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu); unload_nls(sbi->nls_disk); unload_nls(sbi->nls_io); - if (sbi->options.iocharset != fat_default_iocharset) - kfree(sbi->options.iocharset); + fat_reset_iocharset(&sbi->options); kfree(sbi); } @@ -1035,7 +1043,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, opts->fs_fmask = opts->fs_dmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; - opts->iocharset = fat_default_iocharset; + fat_reset_iocharset(opts); if (is_vfat) { opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; opts->rodir = 0; @@ -1185,8 +1193,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, /* vfat specific */ case Opt_charset: - if (opts->iocharset != fat_default_iocharset) - kfree(opts->iocharset); + fat_reset_iocharset(opts); iocharset = match_strdup(&args[0]); if (!iocharset) return -ENOMEM; @@ -1777,8 +1784,7 @@ out_fail: iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); - if (sbi->options.iocharset != fat_default_iocharset) - kfree(sbi->options.iocharset); + fat_reset_iocharset(&sbi->options); sb->s_fs_info = NULL; kfree(sbi); return error; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 72afdca3cea7..3c45a9301a09 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -40,6 +40,9 @@ char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { "panic", /* O2NM_FENCE_PANIC */ }; +static inline void o2nm_lock_subsystem(void); +static inline void o2nm_unlock_subsystem(void); + struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ -181,7 +184,10 @@ static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) { /* through the first node_set .parent * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ - return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + if (node->nd_item.ci_parent) + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + else + return NULL; } enum { @@ -194,7 +200,7 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; int ret = 0; @@ -214,6 +220,13 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) ret = -EEXIST; @@ -226,6 +239,8 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -269,7 +284,7 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; int ret, i; struct rb_node **p, *parent; unsigned int octets[4]; @@ -286,6 +301,13 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); } + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + ret = 0; write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) @@ -298,6 +320,8 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -315,7 +339,7 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; ssize_t ret; @@ -333,17 +357,26 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + ret = -EINVAL; + goto out; + } + /* the only failure case is trying to set a new local node * when a different one is already set */ if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; + cluster->cl_local_node != node->nd_num) { + ret = -EBUSY; + goto out; + } /* bring up the rx thread if we're setting the new local node. */ if (tmp && !cluster->cl_has_local) { ret = o2net_start_listening(node); if (ret) - return ret; + goto out; } if (!tmp && cluster->cl_has_local && @@ -358,7 +391,11 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, cluster->cl_local_node = node->nd_num; } - return count; + ret = count; + +out: + o2nm_unlock_subsystem(); + return ret; } CONFIGFS_ATTR(o2nm_node_, num); @@ -750,6 +787,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = { }, }; +static inline void o2nm_lock_subsystem(void) +{ + mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + +static inline void o2nm_unlock_subsystem(void) +{ + mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + int o2nm_depend_item(struct config_item *item) { return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); diff --git a/fs/proc/array.c b/fs/proc/array.c index d5c6f5b38617..161441f52ebf 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -79,6 +79,7 @@ #include <linux/delayacct.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> +#include <linux/prctl.h> #include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/string_helpers.h> @@ -332,6 +333,31 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); #endif + seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { + case -EINVAL: + seq_printf(m, "unknown"); + break; + case PR_SPEC_NOT_AFFECTED: + seq_printf(m, "not vulnerable"); + break; + case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: + seq_printf(m, "thread force mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_DISABLE: + seq_printf(m, "thread mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_ENABLE: + seq_printf(m, "thread vulnerable"); + break; + case PR_SPEC_DISABLE: + seq_printf(m, "globally mitigated"); + break; + default: + seq_printf(m, "vulnerable"); + break; + } + seq_putc(m, '\n'); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 557dae96ce74..143d40e8a1ea 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -65,25 +65,40 @@ #endif /* + * Feature detection for gnu_inline (gnu89 extern inline semantics). Either + * __GNUC_STDC_INLINE__ is defined (not using gnu89 extern inline semantics, + * and we opt in to the gnu89 semantics), or __GNUC_STDC_INLINE__ is not + * defined so the gnu89 semantics are the default. + */ +#ifdef __GNUC_STDC_INLINE__ +# define __gnu_inline __attribute__((gnu_inline)) +#else +# define __gnu_inline +#endif + +/* * Force always-inline if the user requests it so via the .config, * or if gcc is too old. * GCC does not warn about unused static inline functions for * -Wunused-function. This turns out to avoid the need for complex #ifdef * directives. Suppress the warning in clang as well by using "unused" * function attribute, which is redundant but not harmful for gcc. + * Prefer gnu_inline, so that extern inline functions do not emit an + * externally visible function. This makes extern inline behave as per gnu89 + * semantics rather than c99. This prevents multiple symbol definition errors + * of extern inline functions at link time. + * A lot of inline functions can cause havoc with function tracing. */ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -#define inline inline __attribute__((always_inline,unused)) notrace -#define __inline__ __inline__ __attribute__((always_inline,unused)) notrace -#define __inline __inline __attribute__((always_inline,unused)) notrace +#define inline \ + inline __attribute__((always_inline, unused)) notrace __gnu_inline #else -/* A lot of inline functions can cause havoc with function tracing */ -#define inline inline __attribute__((unused)) notrace -#define __inline__ __inline__ __attribute__((unused)) notrace -#define __inline __inline __attribute__((unused)) notrace +#define inline inline __attribute__((unused)) notrace __gnu_inline #endif +#define __inline__ inline +#define __inline inline #define __always_inline inline __attribute__((always_inline)) #define noinline __attribute__((noinline)) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6d4c3d55d29c..76efa14a6959 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -59,6 +59,8 @@ extern ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, diff --git a/include/linux/nospec.h b/include/linux/nospec.h index e791ebc65c9c..0c5ef54fd416 100644 --- a/include/linux/nospec.h +++ b/include/linux/nospec.h @@ -7,6 +7,8 @@ #define _LINUX_NOSPEC_H #include <asm/barrier.h> +struct task_struct; + /** * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise * @index: array element index @@ -55,4 +57,12 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, \ (typeof(_i)) (_i & _mask); \ }) + +/* Speculation control prctl */ +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which); +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl); +/* Speculation control for seccomp enforced mitigation */ +void arch_seccomp_spec_mitigate(struct task_struct *task); + #endif /* _LINUX_NOSPEC_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 81f422018aa1..5fe8c3dfd60f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2457,6 +2457,8 @@ static inline void memalloc_noio_restore(unsigned int flags) #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ +#define PFA_SPEC_SSB_DISABLE 4 /* Speculative Store Bypass disabled */ +#define PFA_SPEC_SSB_FORCE_DISABLE 5 /* Speculative Store Bypass force disabled*/ #define TASK_PFA_TEST(name, func) \ @@ -2480,6 +2482,13 @@ TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) +TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) + +TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) +TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) + /* * task->jobctl flags */ diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 2296e6b2f690..5a53d34bba26 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -3,7 +3,8 @@ #include <uapi/linux/seccomp.h> -#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC) +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ + SECCOMP_FILTER_FLAG_SPEC_ALLOW) #ifdef CONFIG_SECCOMP diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a6da214d0584..c28bd8be290a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -514,6 +514,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue + * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -594,8 +595,8 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, - xmit_more:1; - /* one bit hole */ + xmit_more:1, + pfmemalloc:1; kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied @@ -615,19 +616,18 @@ struct sk_buff { __u8 __pkt_type_offset[0]; __u8 pkt_type:3; - __u8 pfmemalloc:1; __u8 ignore_df:1; __u8 nfctinfo:3; - __u8 nf_trace:1; + __u8 ip_summed:2; __u8 ooo_okay:1; __u8 l4_hash:1; __u8 sw_hash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; - __u8 no_fcs:1; + /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; @@ -635,11 +635,11 @@ struct sk_buff { __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 csum_bad:1; - #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif __u8 ipvs_property:1; + __u8 inner_protocol_type:1; __u8 remcsum_offload:1; /* 3 or 5 bit hole */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 84f0d0602433..0e01d570fa22 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -762,7 +762,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, * to minimize possbility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ - rol32(hash, 16); + hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; diff --git a/include/net/tcp.h b/include/net/tcp.h index e4f8cde33789..d87a70b8c5e8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -391,6 +391,7 @@ extern int tcp_use_userconfig_sysctl_handler(struct ctl_table *, int, extern int tcp_proc_delayed_ack_control(struct ctl_table *, int, void __user *, size_t *, loff_t *); +void tcp_enter_quickack_mode(struct sock *sk); static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { @@ -574,6 +575,7 @@ void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index c1af9b3c27c4..f0e9f0460ee7 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -206,4 +206,16 @@ struct prctl_mm_map { #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 +/* Per task speculation control */ +#define PR_GET_SPECULATION_CTRL 52 +#define PR_SET_SPECULATION_CTRL 53 +/* Speculation control variants */ +# define PR_SPEC_STORE_BYPASS 0 +/* Return and control values for PR_SET/GET_SPECULATION_CTRL */ +# define PR_SPEC_NOT_AFFECTED 0 +# define PR_SPEC_PRCTL (1UL << 0) +# define PR_SPEC_ENABLE (1UL << 1) +# define PR_SPEC_DISABLE (1UL << 2) +# define PR_SPEC_FORCE_DISABLE (1UL << 3) + #endif /* _LINUX_PRCTL_H */ diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 0f238a43ff1e..e4acb615792b 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -15,7 +15,9 @@ #define SECCOMP_SET_MODE_FILTER 1 /* Valid flags for SECCOMP_SET_MODE_FILTER */ -#define SECCOMP_FILTER_FLAG_TSYNC 1 +#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) +/* In v4.14+ SECCOMP_FILTER_FLAG_LOG is (1UL << 1) */ +#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2) /* * All BPF programs must return a 32-bit value. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index efd384f3f852..9a9203b15cde 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -16,6 +16,8 @@ #include <linux/atomic.h> #include <linux/audit.h> #include <linux/compat.h> +#include <linux/nospec.h> +#include <linux/prctl.h> #include <linux/sched.h> #include <linux/seccomp.h> #include <linux/slab.h> @@ -214,8 +216,11 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) return true; } +void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { } + static inline void seccomp_assign_mode(struct task_struct *task, - unsigned long seccomp_mode) + unsigned long seccomp_mode, + unsigned long flags) { assert_spin_locked(&task->sighand->siglock); @@ -225,6 +230,9 @@ static inline void seccomp_assign_mode(struct task_struct *task, * filter) is set. */ smp_mb__before_atomic(); + /* Assume default seccomp processes want spec flaw mitigation. */ + if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) + arch_seccomp_spec_mitigate(task); set_tsk_thread_flag(task, TIF_SECCOMP); } @@ -292,7 +300,7 @@ static inline pid_t seccomp_can_sync_threads(void) * without dropping the locks. * */ -static inline void seccomp_sync_threads(void) +static inline void seccomp_sync_threads(unsigned long flags) { struct task_struct *thread, *caller; @@ -333,7 +341,8 @@ static inline void seccomp_sync_threads(void) * allow one thread to transition the other. */ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) - seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER, + flags); } } @@ -452,7 +461,7 @@ static long seccomp_attach_filter(unsigned int flags, /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) - seccomp_sync_threads(); + seccomp_sync_threads(flags); return 0; } @@ -747,7 +756,7 @@ static long seccomp_set_mode_strict(void) #ifdef TIF_NOTSC disable_TSC(); #endif - seccomp_assign_mode(current, seccomp_mode); + seccomp_assign_mode(current, seccomp_mode, 0); ret = 0; out: @@ -805,7 +814,7 @@ static long seccomp_set_mode_filter(unsigned int flags, /* Do not free the successfully attached filter. */ prepared = NULL; - seccomp_assign_mode(current, seccomp_mode); + seccomp_assign_mode(current, seccomp_mode, flags); out: spin_unlock_irq(¤t->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) diff --git a/kernel/sys.c b/kernel/sys.c index cf40663a54c2..3195435462a6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2224,6 +2224,17 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif +int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) +{ + return -EINVAL; +} + +int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, + unsigned long ctrl) +{ + return -EINVAL; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2444,6 +2455,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); + case PR_GET_SPECULATION_CTRL: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_prctl_spec_ctrl_get(me, arg2); + break; + case PR_SET_SPECULATION_CTRL: + if (arg4 || arg5) + return -EINVAL; + error = arch_prctl_spec_ctrl_set(me, arg2, arg3); break; default: error = -EINVAL; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 51282f579760..37ea94b636a3 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -670,8 +670,16 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { - return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - (unsigned long)params->min_size); + size_t retsize; + + if (params->nelem_hint) + retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); + else + retsize = max(HASH_DEFAULT_SIZE, + (unsigned long)params->min_size); + + return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) @@ -728,8 +736,6 @@ int rhashtable_init(struct rhashtable *ht, struct bucket_table *tbl; size_t size; - size = HASH_DEFAULT_SIZE; - if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; @@ -756,8 +762,7 @@ int rhashtable_init(struct rhashtable *ht, ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); - if (params->nelem_hint) - size = rounded_hashtable_size(&ht->p); + size = rounded_hashtable_size(&ht->p); /* The maximum (not average) chain length grows with the * size of the hash table, at a rate of (log N)/(log log N). diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 55a9facb8e8d..9a8e688724b1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -996,7 +996,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) int nid, zid; int i; - while ((memcg = parent_mem_cgroup(memcg))) { + for (; memcg; memcg = parent_mem_cgroup(memcg)) { for_each_node(nid) { for (zid = 0; zid < MAX_NR_ZONES; zid++) { mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 665fd87cc105..8b8a43fda6ca 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -404,6 +404,12 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par, watcher = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0); if (IS_ERR(watcher)) return PTR_ERR(watcher); + + if (watcher->family != NFPROTO_BRIDGE) { + module_put(watcher->me); + return -ENOENT; + } + w->u.watcher = watcher; par->target = watcher; @@ -724,6 +730,13 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, goto cleanup_watchers; } + /* Reject UNSPEC, xtables verdicts/return values are incompatible */ + if (target->family != NFPROTO_BRIDGE) { + module_put(target->me); + ret = -ENOENT; + goto cleanup_watchers; + } + t->u.target = target; if (t->u.target == &ebt_standard_target) { if (gap < sizeof(struct ebt_standard_target)) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2017ffa5197a..96c9c0f0905a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2087,9 +2087,12 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) return err; } - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - - __dev_notify_flags(dev, old_flags, ~0U); + if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { + __dev_notify_flags(dev, old_flags, 0U); + } else { + dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + __dev_notify_flags(dev, old_flags, ~0U); + } return 0; } EXPORT_SYMBOL(rtnl_configure_link); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 03928b406d4c..cfbf857b68d9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -854,6 +854,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->cloned = 1; n->nohdr = 0; n->peeked = 0; + C(pfmemalloc); n->destructor = NULL; C(tail); C(end); diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 119c04317d48..03fcf3ee1534 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -599,7 +599,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - ktime_t now = ktime_get_real(); + ktime_t now = ktime_get(); s64 delta = 0; switch (fbtype) { @@ -624,15 +624,14 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, case CCID3_FBACK_PERIODIC: delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); if (delta <= 0) - DCCP_BUG("delta (%ld) <= 0", (long)delta); - else - hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); + delta = 1; + hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); break; default: return; } - ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, + ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta, hc->rx_x_recv, hc->rx_pinv); hc->rx_tstamp_last_feedback = now; @@ -679,7 +678,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - u32 x_recv, p, delta; + u32 x_recv, p; + s64 delta; u64 fval; if (hc->rx_rtt == 0) { @@ -687,7 +687,9 @@ static u32 ccid3_first_li(struct sock *sk) hc->rx_rtt = DCCP_FALLBACK_RTT; } - delta = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback)); + delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback); + if (delta <= 0) + delta = 1; x_recv = scaled_div32(hc->rx_bytes_recv, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index e26df2764e83..1689c7bdf1c9 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -87,35 +87,39 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) opt++; kdebug("options: '%s'", opt); do { + int opt_len, opt_nlen; const char *eq; - int opt_len, opt_nlen, opt_vlen, tmp; + char optval[128]; next_opt = memchr(opt, '#', end - opt) ?: end; opt_len = next_opt - opt; - if (opt_len <= 0 || opt_len > 128) { + if (opt_len <= 0 || opt_len > sizeof(optval)) { pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n", opt_len); return -EINVAL; } - eq = memchr(opt, '=', opt_len) ?: end; - opt_nlen = eq - opt; - eq++; - opt_vlen = next_opt - eq; /* will be -1 if no value */ + eq = memchr(opt, '=', opt_len); + if (eq) { + opt_nlen = eq - opt; + eq++; + memcpy(optval, eq, next_opt - eq); + optval[next_opt - eq] = '\0'; + } else { + opt_nlen = opt_len; + optval[0] = '\0'; + } - tmp = opt_vlen >= 0 ? opt_vlen : 0; - kdebug("option '%*.*s' val '%*.*s'", - opt_nlen, opt_nlen, opt, tmp, tmp, eq); + kdebug("option '%*.*s' val '%s'", + opt_nlen, opt_nlen, opt, optval); /* see if it's an error number representing a DNS error * that's to be recorded as the result in this key */ if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 && memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) { kdebug("dns error number option"); - if (opt_vlen <= 0) - goto bad_option_value; - ret = kstrtoul(eq, 10, &derrno); + ret = kstrtoul(optval, 10, &derrno); if (ret < 0) goto bad_option_value; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7dc9f0680bf6..65a2adfa451f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -297,6 +297,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), .flowi4_scope = scope, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c1d7dc433976..ac2966f02d07 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -480,6 +480,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->dev = from->dev; to->mark = from->mark; + skb_copy_hash(to, from); + /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ce9a7fbb7c5f..88426a6a7a85 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -135,15 +135,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; const struct iphdr *iph = ip_hdr(skb); - __be16 *ports = (__be16 *)skb_transport_header(skb); + __be16 *ports; + int end; - if (skb_transport_offset(skb) + 4 > skb->len) + end = skb_transport_offset(skb) + 4; + if (end > 0 && !pskb_may_pull(skb, end)) return; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ + ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; sin.sin_addr.s_addr = iph->daddr; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 8233e27679f2..5d563615718d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -145,8 +145,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, if (write && ret == 0) { low = make_kgid(user_ns, urange[0]); high = make_kgid(user_ns, urange[1]); - if (!gid_valid(low) || !gid_valid(high) || - (urange[1] < urange[0]) || gid_lt(high, low)) { + if (!gid_valid(low) || !gid_valid(high)) + return -EINVAL; + if (urange[1] < urange[0] || gid_lt(high, low)) { low = make_kgid(&init_user_ns, 1); high = make_kgid(&init_user_ns, 0); } @@ -232,8 +233,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, { struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; - int ret; u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ + __le32 key[4]; + int ret, i; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) @@ -242,11 +244,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, rcu_read_lock(); ctxt = rcu_dereference(tcp_fastopen_ctx); if (ctxt) - memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else - memset(user_key, 0, sizeof(user_key)); + memset(key, 0, sizeof(key)); rcu_read_unlock(); + for (i = 0; i < ARRAY_SIZE(key); i++) + user_key[i] = le32_to_cpu(key[i]); + snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", user_key[0], user_key[1], user_key[2], user_key[3]); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); @@ -262,12 +267,16 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, * first invocation of tcp_fastopen_cookie_gen */ tcp_fastopen_init_key_once(false); - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + + tcp_fastopen_reset_cipher(key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], + user_key[0], user_key[1], user_key[2], user_key[3], (char *)tbl.data, ret); kfree(tbl.data); return ret; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 55d7da1d2ce9..e63b764e55ea 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -131,23 +131,14 @@ static void dctcp_ce_state_0_to_1(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=0 to CE=1 and delayed - * ACK has not sent yet. - */ - if (!ca->ce_state && ca->delayed_ack_reserved) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=0. */ - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; + if (!ca->ce_state) { + /* State has changed from CE=0 to CE=1, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + tcp_enter_quickack_mode(sk); } ca->prior_rcv_nxt = tp->rcv_nxt; @@ -161,23 +152,14 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=1 to CE=0 and delayed - * ACK has not sent yet. - */ - if (ca->ce_state && ca->delayed_ack_reserved) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=1. */ - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; + if (ca->ce_state) { + /* State has changed from CE=1 to CE=0, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + tcp_enter_quickack_mode(sk); } ca->prior_rcv_nxt = tp->rcv_nxt; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2101686ee2ab..f44362ef3e6c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -188,13 +188,14 @@ static void tcp_incr_quickack(struct sock *sk) icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } -static void tcp_enter_quickack_mode(struct sock *sk) +void tcp_enter_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk); icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } +EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -3219,6 +3220,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); + + /* If any of the cumulatively ACKed segments was + * retransmitted, non-SACK case cannot confirm that + * progress was due to original transmission due to + * lack of TCPCB_SACKED_ACKED bits even if some of + * the packets may have been never retransmitted. + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + flag &= ~FLAG_ORIG_SACK_ACKED; } else { int delta; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e490c9a29034..b7c333c7de8b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -177,8 +177,13 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent. */ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, + u32 rcv_nxt) { + struct tcp_sock *tp = tcp_sk(sk); + + if (unlikely(rcv_nxt != tp->rcv_nxt)) + return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -901,8 +906,8 @@ out: * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - gfp_t gfp_mask) +static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, + int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; @@ -962,7 +967,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); + th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); @@ -1005,7 +1010,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, icsk->icsk_af_ops->send_check(sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); if (skb->len != tcp_header_size) tcp_event_data_sent(tp, sk); @@ -1036,6 +1041,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, return net_xmit_eval(err); } +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) +{ + return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, + tcp_sk(sk)->rcv_nxt); +} + /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, @@ -3354,7 +3366,7 @@ void tcp_send_delayed_ack(struct sock *sk) } /* This routine sends an ack and also updates the window. */ -void tcp_send_ack(struct sock *sk) +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) { struct sk_buff *buff; @@ -3391,9 +3403,14 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ skb_mstamp_get(&buff->skb_mstamp); - tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); + __tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC), rcv_nxt); +} +EXPORT_SYMBOL_GPL(__tcp_send_ack); + +void tcp_send_ack(struct sock *sk) +{ + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } -EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index d7c1ee7cf0e2..7a62fd9173d0 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -663,13 +663,16 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; - __be16 *ports = (__be16 *) skb_transport_header(skb); + __be16 *ports; + int end; - if (skb_transport_offset(skb) + 4 <= skb->len) { + end = skb_transport_offset(skb) + 4; + if (end <= 0 || pskb_may_pull(skb, end)) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ + ports = (__be16 *)skb_transport_header(skb); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 74786783834b..0feede45bd28 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -559,6 +559,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->dev = from->dev; to->mark = from->mark; + skb_copy_hash(to, from); + #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index dec4e7bda5f3..11282ffca567 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -692,7 +692,6 @@ static int ipip6_rcv(struct sk_buff *skb) if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6))) goto out; - iph = ip_hdr(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index d25212b135ea..04f060488686 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -754,11 +754,14 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); - pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT, + pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, 0, frag_len + LLCP_HEADER_SIZE, &err); if (pdu == NULL) { - pr_err("Could not allocate PDU\n"); - continue; + pr_err("Could not allocate PDU (error=%d)\n", err); + len -= remaining_len; + if (len == 0) + len = err; + break; } pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI); diff --git a/net/rds/loop.c b/net/rds/loop.c index 6b12b68541ae..05cab8c5a379 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -191,4 +191,5 @@ struct rds_transport rds_loop_transport = { .inc_copy_to_user = rds_message_inc_copy_to_user, .inc_free = rds_loop_inc_free, .t_name = "loopback", + .t_type = RDS_TRANS_LOOP, }; diff --git a/net/rds/rds.h b/net/rds/rds.h index 4588860f4c3b..254f1345cf7e 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -401,6 +401,11 @@ struct rds_notifier { int n_status; }; +/* Available as part of RDS core, so doesn't need to participate + * in get_preferred transport etc + */ +#define RDS_TRANS_LOOP 3 + /** * struct rds_transport - transport specific behavioural hooks * diff --git a/net/rds/recv.c b/net/rds/recv.c index 0514af3ab378..6275de19689c 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -76,6 +76,11 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, return; rs->rs_rcv_bytes += delta; + + /* loop transport doesn't send/recv congestion updates */ + if (rs->rs_transport->t_type == RDS_TRANS_LOOP) + return; + now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index 3fee70d9814f..562edd50fa94 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -20,7 +20,7 @@ static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) { qdisc_drop(skb, sch); - return NET_XMIT_SUCCESS; + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 882fe83a3554..b3f345433ec7 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1476,15 +1476,19 @@ TEST_F(TRACE_syscall, syscall_dropped) #define SECCOMP_SET_MODE_FILTER 1 #endif -#ifndef SECCOMP_FLAG_FILTER_TSYNC -#define SECCOMP_FLAG_FILTER_TSYNC 1 +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) +#endif + +#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW +#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2) #endif #ifndef seccomp -int seccomp(unsigned int op, unsigned int flags, struct sock_fprog *filter) +int seccomp(unsigned int op, unsigned int flags, void *args) { errno = 0; - return syscall(__NR_seccomp, op, flags, filter); + return syscall(__NR_seccomp, op, flags, args); } #endif @@ -1576,6 +1580,78 @@ TEST(seccomp_syscall_mode_lock) } } +/* + * Test detection of known and unknown filter flags. Userspace needs to be able + * to check if a filter flag is supported by the current kernel and a good way + * of doing that is by attempting to enter filter mode, with the flag bit in + * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates + * that the flag is valid and EINVAL indicates that the flag is invalid. + */ +TEST(detect_seccomp_filter_flags) +{ + unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, + SECCOMP_FILTER_FLAG_SPEC_ALLOW }; + unsigned int flag, all_flags; + int i; + long ret; + + /* Test detection of known-good filter flags */ + for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) { + int bits = 0; + + flag = flags[i]; + /* Make sure the flag is a single bit! */ + while (flag) { + if (flag & 0x1) + bits ++; + flag >>= 1; + } + ASSERT_EQ(1, bits); + flag = flags[i]; + + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + EXPECT_EQ(-1, ret); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!", + flag); + } + + all_flags |= flag; + } + + /* Test detection of all known-good filter flags */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, all_flags, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!", + all_flags); + } + + /* Test detection of an unknown filter flag */ + flag = -1; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!", + flag); + } + + /* + * Test detection of an unknown filter flag that may simply need to be + * added to this test + */ + flag = flags[ARRAY_SIZE(flags) - 1] << 1; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?", + flag); + } +} + TEST(TSYNC_first) { struct sock_filter filter[] = { @@ -1592,7 +1668,7 @@ TEST(TSYNC_first) TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } - ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &prog); ASSERT_NE(ENOSYS, errno) { TH_LOG("Kernel does not support seccomp syscall!"); @@ -1810,7 +1886,7 @@ TEST_F(TSYNC, two_siblings_with_ancestor) self->sibling_count++; } - ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &self->apply_prog); ASSERT_EQ(0, ret) { TH_LOG("Could install filter on all threads!"); @@ -1871,7 +1947,7 @@ TEST_F(TSYNC, two_siblings_with_no_filter) TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } - ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &self->apply_prog); ASSERT_NE(ENOSYS, errno) { TH_LOG("Kernel does not support seccomp syscall!"); @@ -1919,7 +1995,7 @@ TEST_F(TSYNC, two_siblings_with_one_divergence) self->sibling_count++; } - ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &self->apply_prog); ASSERT_EQ(self->sibling[0].system_tid, ret) { TH_LOG("Did not fail on diverged sibling."); @@ -1971,7 +2047,7 @@ TEST_F(TSYNC, two_siblings_not_under_filter) TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!"); } - ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &self->apply_prog); ASSERT_EQ(ret, self->sibling[0].system_tid) { TH_LOG("Did not fail on diverged sibling."); @@ -2000,7 +2076,7 @@ TEST_F(TSYNC, two_siblings_not_under_filter) /* Switch to the remaining sibling */ sib = !sib; - ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &self->apply_prog); ASSERT_EQ(0, ret) { TH_LOG("Expected the remaining sibling to sync"); @@ -2023,7 +2099,7 @@ TEST_F(TSYNC, two_siblings_not_under_filter) while (!kill(self->sibling[sib].system_tid, 0)) sleep(0.1); - ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &self->apply_prog); ASSERT_EQ(0, ret); /* just us chickens */ } diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 49001fa84ead..1203829316b2 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -119,8 +119,12 @@ irqfd_shutdown(struct work_struct *work) { struct kvm_kernel_irqfd *irqfd = container_of(work, struct kvm_kernel_irqfd, shutdown); + struct kvm *kvm = irqfd->kvm; u64 cnt; + /* Make sure irqfd has been initalized in assign path. */ + synchronize_srcu(&kvm->irq_srcu); + /* * Synchronize with the wait-queue and unhook ourselves to prevent * further events. @@ -387,7 +391,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) idx = srcu_read_lock(&kvm->irq_srcu); irqfd_update(kvm, irqfd); - srcu_read_unlock(&kvm->irq_srcu, idx); list_add_tail(&irqfd->list, &kvm->irqfds.items); @@ -419,6 +422,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) irqfd->consumer.token, ret); #endif + srcu_read_unlock(&kvm->irq_srcu, idx); return 0; fail: |
