summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild2
-rw-r--r--arch/x86/Kconfig54
-rw-r--r--arch/x86/Makefile15
-rw-r--r--arch/x86/boot/code16gcc.h24
-rw-r--r--arch/x86/boot/compressed/aslr.c15
-rw-r--r--arch/x86/boot/compressed/eboot.c72
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S546
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c40
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S281
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S805
-rw-r--r--arch/x86/crypto/des3_ede_glue.c509
-rw-r--r--arch/x86/include/asm/Kbuild3
-rw-r--r--arch/x86/include/asm/acenv.h4
-rw-r--r--arch/x86/include/asm/acpi.h5
-rw-r--r--arch/x86/include/asm/alternative.h14
-rw-r--r--arch/x86/include/asm/apic.h48
-rw-r--r--arch/x86/include/asm/barrier.h2
-rw-r--r--arch/x86/include/asm/bitops.h2
-rw-r--r--arch/x86/include/asm/cmpxchg.h4
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h409
-rw-r--r--arch/x86/include/asm/crash.h9
-rw-r--r--arch/x86/include/asm/efi.h9
-rw-r--r--arch/x86/include/asm/fixmap.h6
-rw-r--r--arch/x86/include/asm/fpu-internal.h11
-rw-r--r--arch/x86/include/asm/ftrace.h2
-rw-r--r--arch/x86/include/asm/hardirq.h3
-rw-r--r--arch/x86/include/asm/i8259.h5
-rw-r--r--arch/x86/include/asm/io_apic.h59
-rw-r--r--arch/x86/include/asm/kexec-bzimage64.h6
-rw-r--r--arch/x86/include/asm/kexec.h45
-rw-r--r--arch/x86/include/asm/kvm_emulate.h33
-rw-r--r--arch/x86/include/asm/kvm_host.h17
-rw-r--r--arch/x86/include/asm/mc146818rtc.h2
-rw-r--r--arch/x86/include/asm/mmu_context.h6
-rw-r--r--arch/x86/include/asm/mpspec.h15
-rw-r--r--arch/x86/include/asm/mutex_32.h16
-rw-r--r--arch/x86/include/asm/mwait.h2
-rw-r--r--arch/x86/include/asm/page.h1
-rw-r--r--arch/x86/include/asm/page_64.h2
-rw-r--r--arch/x86/include/asm/percpu.h3
-rw-r--r--arch/x86/include/asm/pgtable.h9
-rw-r--r--arch/x86/include/asm/pgtable_64.h1
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h78
-rw-r--r--arch/x86/include/asm/pmc_atom.h107
-rw-r--r--arch/x86/include/asm/processor.h7
-rw-r--r--arch/x86/include/asm/prom.h2
-rw-r--r--arch/x86/include/asm/qrwlock.h2
-rw-r--r--arch/x86/include/asm/scatterlist.h8
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h10
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h19
-rw-r--r--arch/x86/include/asm/vdso.h18
-rw-r--r--arch/x86/include/asm/vga.h6
-rw-r--r--arch/x86/include/asm/vmx.h7
-rw-r--r--arch/x86/include/asm/xsave.h223
-rw-r--r--arch/x86/include/uapi/asm/Kbuild1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/include/uapi/asm/kvm_perf.h16
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h6
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/apei.c62
-rw-r--r--arch/x86/kernel/acpi/boot.c404
-rw-r--r--arch/x86/kernel/apic/apic.c75
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c16
-rw-r--r--arch/x86/kernel/apic/apic_noop.c23
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c8
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c14
-rw-r--r--arch/x86/kernel/apic/io_apic.c784
-rw-r--r--arch/x86/kernel/apic/probe_32.c33
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c8
-rw-r--r--arch/x86/kernel/cpu/amd.c348
-rw-r--r--arch/x86/kernel/cpu/common.c30
-rw-r--r--arch/x86/kernel/cpu/intel.c30
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c18
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh51
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c111
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c15
-rw-r--r--arch/x86/kernel/cpu/proc.c8
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/crash.c561
-rw-r--r--arch/x86/kernel/devicetree.c207
-rw-r--r--arch/x86/kernel/entry_32.S11
-rw-r--r--arch/x86/kernel/entry_64.S52
-rw-r--r--arch/x86/kernel/ftrace.c3
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/iosf_mbi.c2
-rw-r--r--arch/x86/kernel/irqinit.c14
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c553
-rw-r--r--arch/x86/kernel/kprobes/opt.c4
-rw-r--r--arch/x86/kernel/machine_kexec_64.c250
-rw-r--r--arch/x86/kernel/mcount_64.S13
-rw-r--r--arch/x86/kernel/mpparse.c111
-rw-r--r--arch/x86/kernel/pmc_atom.c321
-rw-r--r--arch/x86/kernel/process.c1
-rw-r--r--arch/x86/kernel/reboot.c24
-rw-r--r--arch/x86/kernel/resource.c8
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kernel/smpboot.c11
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/tsc.c28
-rw-r--r--arch/x86/kernel/vsmp_64.c4
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c23
-rw-r--r--arch/x86/kernel/xsave.c118
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c495
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/lapic.c56
-rw-r--r--arch/x86/kvm/mmu_audit.c2
-rw-r--r--arch/x86/kvm/mmutrace.h4
-rw-r--r--arch/x86/kvm/pmu.c9
-rw-r--r--arch/x86/kvm/svm.c57
-rw-r--r--arch/x86/kvm/trace.h6
-rw-r--r--arch/x86/kvm/vmx.c241
-rw-r--r--arch/x86/kvm/x86.c235
-rw-r--r--arch/x86/kvm/x86.h27
-rw-r--r--arch/x86/mm/dump_pagetables.c4
-rw-r--r--arch/x86/mm/fault.c9
-rw-r--r--arch/x86/mm/init.c7
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/mm/tlb.c109
-rw-r--r--arch/x86/net/bpf_jit_comp.c16
-rw-r--r--arch/x86/pci/acpi.c6
-rw-r--r--arch/x86/pci/fixup.c3
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/pci/intel_mid_pci.c27
-rw-r--r--arch/x86/pci/irq.c16
-rw-r--r--arch/x86/pci/xen.c7
-rw-r--r--arch/x86/platform/ce4100/ce4100.c11
-rw-r--r--arch/x86/platform/efi/Makefile2
-rw-r--r--arch/x86/platform/efi/efi.c483
-rw-r--r--arch/x86/platform/efi/quirks.c290
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_wdt.c22
-rw-r--r--arch/x86/platform/intel-mid/sfi.c56
-rw-r--r--arch/x86/platform/sfi/sfi.c10
-rw-r--r--arch/x86/platform/ts5500/ts5500.c94
-rw-r--r--arch/x86/platform/uv/tlb_uv.c71
-rw-r--r--arch/x86/power/cpu.c4
-rw-r--r--arch/x86/purgatory/Makefile28
-rw-r--r--arch/x86/purgatory/entry64.S101
-rw-r--r--arch/x86/purgatory/purgatory.c72
-rw-r--r--arch/x86/purgatory/setup-x86_64.S58
-rw-r--r--arch/x86/purgatory/sha256.c283
-rw-r--r--arch/x86/purgatory/sha256.h22
-rw-r--r--arch/x86/purgatory/stack.S19
-rw-r--r--arch/x86/purgatory/string.c13
-rw-r--r--arch/x86/syscalls/syscall_32.tbl3
-rw-r--r--arch/x86/syscalls/syscall_64.tbl4
-rw-r--r--arch/x86/um/asm/elf.h1
-rw-r--r--arch/x86/um/asm/processor.h3
-rw-r--r--arch/x86/um/mem_64.c15
-rw-r--r--arch/x86/um/signal.c45
-rw-r--r--arch/x86/vdso/Makefile16
-rw-r--r--arch/x86/vdso/vdso-fakesections.c21
-rw-r--r--arch/x86/vdso/vdso-layout.lds.S44
-rw-r--r--arch/x86/vdso/vdso2c.c128
-rw-r--r--arch/x86/vdso/vdso2c.h227
-rw-r--r--arch/x86/vdso/vdso32-setup.c19
-rw-r--r--arch/x86/vdso/vma.c22
-rw-r--r--arch/x86/xen/Makefile1
-rw-r--r--arch/x86/xen/efi.c43
-rw-r--r--arch/x86/xen/enlighten.c15
-rw-r--r--arch/x86/xen/grant-table.c70
-rw-r--r--arch/x86/xen/mmu.c27
-rw-r--r--arch/x86/xen/p2m.c5
-rw-r--r--arch/x86/xen/time.c2
-rw-r--r--arch/x86/xen/xen-ops.h8
179 files changed, 8309 insertions, 3463 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index e5287d8517aa..3942f74c92d7 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -16,3 +16,5 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/
obj-y += platform/
obj-y += net/
+
+obj-$(CONFIG_KEXEC_FILE) += purgatory/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d24887b645dc..36327438caf0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,9 @@ config X86_64
### Arch settings
config X86
def_bool y
+ select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_FAST_MULTIPLIER
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_AOUT if X86_32
@@ -54,7 +56,6 @@ config X86
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_FP_TEST
- select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_SYSCALL_TRACEPOINTS
select SYSCTL_EXCEPTION_TRACE
select HAVE_KVM
@@ -96,6 +97,7 @@ config X86
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
@@ -109,9 +111,9 @@ config X86
select CLOCKSOURCE_WATCHDOG
select GENERIC_CLOCKEVENTS
select ARCH_CLOCKSOURCE_DATA
+ select CLOCKSOURCE_VALIDATE_LAST_CYCLE
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
select GENERIC_TIME_VSYSCALL
- select KTIME_SCALAR if X86_32
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_CONTEXT_TRACKING if X86_64
@@ -132,6 +134,9 @@ config X86
select GENERIC_CPU_AUTOPROBE
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_ACPI_APEI if ACPI
+ select HAVE_ACPI_APEI_NMI if ACPI
+ select ACPI_LEGACY_TABLES_LOOKUP if ACPI
config INSTRUCTION_DECODER
def_bool y
@@ -430,6 +435,7 @@ config X86_INTEL_CE
bool "CE4100 TV platform"
depends on PCI
depends on PCI_GODIRECT
+ depends on X86_IO_APIC
depends on X86_32
depends on X86_EXTENDED_PLATFORM
select X86_REBOOTFIXUPS
@@ -836,6 +842,7 @@ config X86_IO_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+ select IRQ_DOMAIN
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
@@ -1523,6 +1530,7 @@ config EFI
bool "EFI runtime service support"
depends on ACPI
select UCS2_STRING
+ select EFI_RUNTIME_WRAPPERS
---help---
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
@@ -1536,7 +1544,8 @@ config EFI
config EFI_STUB
bool "EFI stub support"
- depends on EFI
+ depends on EFI && !X86_USE_3DNOW
+ select RELOCATABLE
---help---
This kernel feature allows a bzImage to be loaded directly
by EFI firmware without the use of a bootloader.
@@ -1591,6 +1600,41 @@ config KEXEC
interface is strongly in flux, so no good recommendation can be
made.
+config KEXEC_FILE
+ bool "kexec file based system call"
+ select BUILD_BIN2C
+ depends on KEXEC
+ depends on X86_64
+ depends on CRYPTO=y
+ depends on CRYPTO_SHA256=y
+ ---help---
+ This is new version of kexec system call. This system call is
+ file based and takes file descriptors as system call argument
+ for kernel and initramfs as opposed to list of segments as
+ accepted by previous system call.
+
+config KEXEC_VERIFY_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on KEXEC_FILE
+ ---help---
+ This option makes kernel signature verification mandatory for
+ kexec_file_load() syscall. If kernel is signature can not be
+ verified, kexec_file_load() will fail.
+
+ This option enforces signature verification at generic level.
+ One needs to enable signature verification for type of kernel
+ image being loaded to make sure it works. For example, enable
+ bzImage signature verification option to be able to load and
+ verify signatures of bzImage. Otherwise kernel loading will fail.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on KEXEC_VERIFY_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ ---help---
+ Enable bzImage signature verification support.
+
config CRASH_DUMP
bool "kernel crash dumps"
depends on X86_64 || (X86_32 && HIGHMEM)
@@ -2404,6 +2448,10 @@ config IOSF_MBI
default m
depends on PCI
+config PMC_ATOM
+ def_bool y
+ depends on PCI
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 33f71b01fd22..60087ca37679 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -15,12 +15,9 @@ endif
# that way we can complain to the user if the CPU is insufficient.
#
# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
-# older versions of GCC, we need to play evil and unreliable tricks to
-# attempt to ensure that our asm(".code16gcc") is first in the asm
-# output.
-CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time))
+# older versions of GCC, include an *assembly* header to make sure that
+# gcc doesn't play any games behind our back.
+CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h
M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
@@ -186,6 +183,11 @@ archscripts: scripts_basic
archheaders:
$(Q)$(MAKE) $(build)=arch/x86/syscalls all
+archprepare:
+ifeq ($(CONFIG_KEXEC_FILE),y)
+ $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c
+endif
+
###
# Kernel objects
@@ -249,6 +251,7 @@ archclean:
$(Q)rm -rf $(objtree)/arch/x86_64
$(Q)$(MAKE) $(clean)=$(boot)
$(Q)$(MAKE) $(clean)=arch/x86/tools
+ $(Q)$(MAKE) $(clean)=arch/x86/purgatory
PHONY += kvmconfig
kvmconfig:
diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h
index d93e48010b61..5ff426535397 100644
--- a/arch/x86/boot/code16gcc.h
+++ b/arch/x86/boot/code16gcc.h
@@ -1,15 +1,11 @@
-/*
- * code16gcc.h
- *
- * This file is -include'd when compiling 16-bit C code.
- * Note: this asm() needs to be emitted before gcc emits any code.
- * Depending on gcc version, this requires -fno-unit-at-a-time or
- * -fno-toplevel-reorder.
- *
- * Hopefully gcc will eventually have a real -m16 option so we can
- * drop this hack long term.
- */
+#
+# code16gcc.h
+#
+# This file is added to the assembler via -Wa when compiling 16-bit C code.
+# This is done this way instead via asm() to make sure gcc does not reorder
+# things around us.
+#
+# gcc 4.9+ has a real -m16 option so we can drop this hack long term.
+#
-#ifndef __ASSEMBLY__
-asm(".code16gcc");
-#endif
+ .code16gcc
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index fc6091abedb7..d39189ba7f8e 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -183,12 +183,27 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
static bool mem_avoid_overlap(struct mem_vector *img)
{
int i;
+ struct setup_data *ptr;
for (i = 0; i < MEM_AVOID_MAX; i++) {
if (mem_overlaps(img, &mem_avoid[i]))
return true;
}
+ /* Avoid all entries in the setup_data linked list. */
+ ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data;
+ while (ptr) {
+ struct mem_vector avoid;
+
+ avoid.start = (u64)ptr;
+ avoid.size = sizeof(*ptr) + ptr->len;
+
+ if (mem_overlaps(img, &avoid))
+ return true;
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
return false;
}
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 0331d765c2bb..de8eebd6f67c 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -48,8 +48,7 @@ static void setup_boot_services##bits(struct efi_config *c) \
BOOT_SERVICES(32);
BOOT_SERVICES(64);
-static void efi_printk(efi_system_table_t *, char *);
-static void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
+void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
static efi_status_t
__file_size32(void *__fh, efi_char16_t *filename_16,
@@ -156,7 +155,7 @@ grow:
return status;
}
-static efi_status_t
+efi_status_t
efi_file_size(efi_system_table_t *sys_table, void *__fh,
efi_char16_t *filename_16, void **handle, u64 *file_sz)
{
@@ -166,7 +165,7 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh,
return __file_size32(__fh, filename_16, handle, file_sz);
}
-static inline efi_status_t
+efi_status_t
efi_file_read(void *handle, unsigned long *size, void *addr)
{
unsigned long func;
@@ -184,7 +183,7 @@ efi_file_read(void *handle, unsigned long *size, void *addr)
}
}
-static inline efi_status_t efi_file_close(void *handle)
+efi_status_t efi_file_close(void *handle)
{
if (efi_early->is64) {
efi_file_handle_64_t *fh = handle;
@@ -249,7 +248,7 @@ static inline efi_status_t __open_volume64(void *__image, void **__fh)
return status;
}
-static inline efi_status_t
+efi_status_t
efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
{
if (efi_early->is64)
@@ -258,7 +257,7 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
return __open_volume32(__image, __fh);
}
-static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
+void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
{
unsigned long output_string;
size_t offset;
@@ -269,22 +268,24 @@ static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
offset = offsetof(typeof(*out), output_string);
output_string = efi_early->text_output + offset;
+ out = (typeof(out))(unsigned long)efi_early->text_output;
func = (u64 *)output_string;
- efi_early->call(*func, efi_early->text_output, str);
+ efi_early->call(*func, out, str);
} else {
struct efi_simple_text_output_protocol_32 *out;
u32 *func;
offset = offsetof(typeof(*out), output_string);
output_string = efi_early->text_output + offset;
+ out = (typeof(out))(unsigned long)efi_early->text_output;
func = (u32 *)output_string;
- efi_early->call(*func, efi_early->text_output, str);
+ efi_early->call(*func, out, str);
}
}
-#include "../../../../drivers/firmware/efi/efi-stub-helper.c"
+#include "../../../../drivers/firmware/efi/libstub/efi-stub-helper.c"
static void find_bits(unsigned long mask, u8 *pos, u8 *size)
{
@@ -366,7 +367,7 @@ free_struct:
return status;
}
-static efi_status_t
+static void
setup_efi_pci32(struct boot_params *params, void **pci_handle,
unsigned long size)
{
@@ -409,8 +410,6 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle,
data = (struct setup_data *)rom;
}
-
- return status;
}
static efi_status_t
@@ -469,7 +468,7 @@ free_struct:
}
-static efi_status_t
+static void
setup_efi_pci64(struct boot_params *params, void **pci_handle,
unsigned long size)
{
@@ -512,11 +511,18 @@ setup_efi_pci64(struct boot_params *params, void **pci_handle,
data = (struct setup_data *)rom;
}
-
- return status;
}
-static efi_status_t setup_efi_pci(struct boot_params *params)
+/*
+ * There's no way to return an informative status from this function,
+ * because any analysis (and printing of error messages) needs to be
+ * done directly at the EFI function call-site.
+ *
+ * For example, EFI_INVALID_PARAMETER could indicate a bug or maybe we
+ * just didn't find any PCI devices, but there's no way to tell outside
+ * the context of the call.
+ */
+static void setup_efi_pci(struct boot_params *params)
{
efi_status_t status;
void **pci_handle = NULL;
@@ -533,7 +539,7 @@ static efi_status_t setup_efi_pci(struct boot_params *params)
size, (void **)&pci_handle);
if (status != EFI_SUCCESS)
- return status;
+ return;
status = efi_call_early(locate_handle,
EFI_LOCATE_BY_PROTOCOL, &pci_proto,
@@ -544,13 +550,12 @@ static efi_status_t setup_efi_pci(struct boot_params *params)
goto free_handle;
if (efi_early->is64)
- status = setup_efi_pci64(params, pci_handle, size);
+ setup_efi_pci64(params, pci_handle, size);
else
- status = setup_efi_pci32(params, pci_handle, size);
+ setup_efi_pci32(params, pci_handle, size);
free_handle:
efi_call_early(free_pool, pci_handle);
- return status;
}
static void
@@ -1104,10 +1109,22 @@ struct boot_params *make_boot_params(struct efi_config *c)
(char *)(unsigned long)hdr->cmd_line_ptr,
"initrd=", hdr->initrd_addr_max,
&ramdisk_addr, &ramdisk_size);
+
+ if (status != EFI_SUCCESS &&
+ hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G) {
+ efi_printk(sys_table, "Trying to load files to higher address\n");
+ status = handle_cmdline_files(sys_table, image,
+ (char *)(unsigned long)hdr->cmd_line_ptr,
+ "initrd=", -1UL,
+ &ramdisk_addr, &ramdisk_size);
+ }
+
if (status != EFI_SUCCESS)
goto fail2;
- hdr->ramdisk_image = ramdisk_addr;
- hdr->ramdisk_size = ramdisk_size;
+ hdr->ramdisk_image = ramdisk_addr & 0xffffffff;
+ hdr->ramdisk_size = ramdisk_size & 0xffffffff;
+ boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32;
+ boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32;
return boot_params;
fail2:
@@ -1401,16 +1418,20 @@ struct boot_params *efi_main(struct efi_config *c,
hdr->init_size, hdr->init_size,
hdr->pref_address,
hdr->kernel_alignment);
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "efi_relocate_kernel() failed!\n");
goto fail;
+ }
hdr->pref_address = hdr->code32_start;
hdr->code32_start = bzimage_addr;
}
status = exit_boot(boot_params, handle, is64);
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "exit_boot() failed!\n");
goto fail;
+ }
memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address;
@@ -1470,5 +1491,6 @@ struct boot_params *efi_main(struct efi_config *c,
return boot_params;
fail:
+ efi_printk(sys_table, "efi_main() failed!\n");
return NULL;
}
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 7a6d43a554d7..16ef02596db2 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -154,7 +154,7 @@ extra_header_fields:
#else
.quad 0 # ImageBase
#endif
- .long 0x20 # SectionAlignment
+ .long CONFIG_PHYSICAL_ALIGN # SectionAlignment
.long 0x20 # FileAlignment
.word 0 # MajorOperatingSystemVersion
.word 0 # MinorOperatingSystemVersion
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 61d6e281898b..d551165a3159 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
@@ -52,6 +53,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
@@ -76,7 +78,7 @@ ifeq ($(avx2_supported),yes)
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
new file mode 100644
index 000000000000..f091f122ed24
--- /dev/null
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -0,0 +1,546 @@
+/*
+ * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
+ *
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
+ *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptograhpic library.
+ * Additional information on it can be found at:
+ * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+#define CONCAT(a,b) a##b
+#define VMOVDQ vmovdqu
+
+#define xdata0 %xmm0
+#define xdata1 %xmm1
+#define xdata2 %xmm2
+#define xdata3 %xmm3
+#define xdata4 %xmm4
+#define xdata5 %xmm5
+#define xdata6 %xmm6
+#define xdata7 %xmm7
+#define xcounter %xmm8
+#define xbyteswap %xmm9
+#define xkey0 %xmm10
+#define xkey3 %xmm11
+#define xkey6 %xmm12
+#define xkey9 %xmm13
+#define xkey4 %xmm11
+#define xkey8 %xmm12
+#define xkey12 %xmm13
+#define xkeyA %xmm14
+#define xkeyB %xmm15
+
+#define p_in %rdi
+#define p_iv %rsi
+#define p_keys %rdx
+#define p_out %rcx
+#define num_bytes %r8
+
+#define tmp %r10
+#define DDQ(i) CONCAT(ddq_add_,i)
+#define XMM(i) CONCAT(%xmm, i)
+#define DDQ_DATA 0
+#define XDATA 1
+#define KEY_128 1
+#define KEY_192 2
+#define KEY_256 3
+
+.section .rodata
+.align 16
+
+byteswap_const:
+ .octa 0x000102030405060708090A0B0C0D0E0F
+ddq_add_1:
+ .octa 0x00000000000000000000000000000001
+ddq_add_2:
+ .octa 0x00000000000000000000000000000002
+ddq_add_3:
+ .octa 0x00000000000000000000000000000003
+ddq_add_4:
+ .octa 0x00000000000000000000000000000004
+ddq_add_5:
+ .octa 0x00000000000000000000000000000005
+ddq_add_6:
+ .octa 0x00000000000000000000000000000006
+ddq_add_7:
+ .octa 0x00000000000000000000000000000007
+ddq_add_8:
+ .octa 0x00000000000000000000000000000008
+
+.text
+
+/* generate a unique variable for ddq_add_x */
+
+.macro setddq n
+ var_ddq_add = DDQ(\n)
+.endm
+
+/* generate a unique variable for xmm register */
+.macro setxdata n
+ var_xdata = XMM(\n)
+.endm
+
+/* club the numeric 'id' to the symbol 'name' */
+
+.macro club name, id
+.altmacro
+ .if \name == DDQ_DATA
+ setddq %\id
+ .elseif \name == XDATA
+ setxdata %\id
+ .endif
+.noaltmacro
+.endm
+
+/*
+ * do_aes num_in_par load_keys key_len
+ * This increments p_in, but not p_out
+ */
+.macro do_aes b, k, key_len
+ .set by, \b
+ .set load_keys, \k
+ .set klen, \key_len
+
+ .if (load_keys)
+ vmovdqa 0*16(p_keys), xkey0
+ .endif
+
+ vpshufb xbyteswap, xcounter, xdata0
+
+ .set i, 1
+ .rept (by - 1)
+ club DDQ_DATA, i
+ club XDATA, i
+ vpaddd var_ddq_add(%rip), xcounter, var_xdata
+ vpshufb xbyteswap, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 1*16(p_keys), xkeyA
+
+ vpxor xkey0, xdata0, xdata0
+ club DDQ_DATA, by
+ vpaddd var_ddq_add(%rip), xcounter, xcounter
+
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpxor xkey0, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 2*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 3*16(p_keys), xkeyA
+ .endif
+ .else
+ vmovdqa 3*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
+ .set i, (i +1)
+ .endr
+
+ add $(16*by), p_in
+
+ .if (klen == KEY_128)
+ vmovdqa 4*16(p_keys), xkey4
+ .else
+ .if (load_keys)
+ vmovdqa 4*16(p_keys), xkey4
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 3 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 5*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkey4, var_xdata, var_xdata /* key 4 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 6*16(p_keys), xkeyB
+ .endif
+ .else
+ vmovdqa 6*16(p_keys), xkeyB
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 7*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyB, var_xdata, var_xdata /* key 6 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ vmovdqa 8*16(p_keys), xkey8
+ .else
+ .if (load_keys)
+ vmovdqa 8*16(p_keys), xkey8
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 9*16(p_keys), xkeyA
+ .endif
+ .else
+ vmovdqa 9*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkey8, var_xdata, var_xdata /* key 8 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 10*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 9 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ vmovdqa 11*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 10 */
+ .if (klen == KEY_128)
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkeyB, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ .if (load_keys)
+ vmovdqa 12*16(p_keys), xkey12
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 13*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ .if (klen == KEY_256)
+ /* key 12 */
+ vaesenc xkey12, var_xdata, var_xdata
+ .else
+ vaesenclast xkey12, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 14*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 13 */
+ vaesenc xkeyA, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 14 */
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .endif
+ .endif
+
+ .set i, 0
+ .rept (by / 2)
+ .set j, (i+1)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ VMOVDQ (j*16 - 16*by)(p_in), xkeyB
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ club XDATA, j
+ vpxor xkeyB, var_xdata, var_xdata
+ .set i, (i+2)
+ .endr
+
+ .if (i < by)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ VMOVDQ var_xdata, i*16(p_out)
+ .set i, (i+1)
+ .endr
+.endm
+
+.macro do_aes_load val, key_len
+ do_aes \val, 1, \key_len
+.endm
+
+.macro do_aes_noload val, key_len
+ do_aes \val, 0, \key_len
+.endm
+
+/* main body of aes ctr load */
+
+.macro do_aes_ctrmain key_len
+
+ cmp $16, num_bytes
+ jb .Ldo_return2\key_len
+
+ vmovdqa byteswap_const(%rip), xbyteswap
+ vmovdqu (p_iv), xcounter
+ vpshufb xbyteswap, xcounter, xcounter
+
+ mov num_bytes, tmp
+ and $(7*16), tmp
+ jz .Lmult_of_8_blks\key_len
+
+ /* 1 <= tmp <= 7 */
+ cmp $(4*16), tmp
+ jg .Lgt4\key_len
+ je .Leq4\key_len
+
+.Llt4\key_len:
+ cmp $(2*16), tmp
+ jg .Leq3\key_len
+ je .Leq2\key_len
+
+.Leq1\key_len:
+ do_aes_load 1, \key_len
+ add $(1*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq2\key_len:
+ do_aes_load 2, \key_len
+ add $(2*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+
+.Leq3\key_len:
+ do_aes_load 3, \key_len
+ add $(3*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq4\key_len:
+ do_aes_load 4, \key_len
+ add $(4*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lgt4\key_len:
+ cmp $(6*16), tmp
+ jg .Leq7\key_len
+ je .Leq6\key_len
+
+.Leq5\key_len:
+ do_aes_load 5, \key_len
+ add $(5*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq6\key_len:
+ do_aes_load 6, \key_len
+ add $(6*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq7\key_len:
+ do_aes_load 7, \key_len
+ add $(7*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lmult_of_8_blks\key_len:
+ .if (\key_len != KEY_128)
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 4*16(p_keys), xkey4
+ vmovdqa 8*16(p_keys), xkey8
+ vmovdqa 12*16(p_keys), xkey12
+ .else
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 3*16(p_keys), xkey4
+ vmovdqa 6*16(p_keys), xkey8
+ vmovdqa 9*16(p_keys), xkey12
+ .endif
+.align 16
+.Lmain_loop2\key_len:
+ /* num_bytes is a multiple of 8 and >0 */
+ do_aes_noload 8, \key_len
+ add $(8*16), p_out
+ sub $(8*16), num_bytes
+ jne .Lmain_loop2\key_len
+
+.Ldo_return2\key_len:
+ /* return updated IV */
+ vpshufb xbyteswap, xcounter, xcounter
+ vmovdqu xcounter, (p_iv)
+ ret
+.endm
+
+/*
+ * routine to do AES128 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_128_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_128
+
+ENDPROC(aes_ctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_192_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_192
+
+ENDPROC(aes_ctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_256_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_256
+
+ENDPROC(aes_ctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 948ad0e77741..a7ccd57f19e4 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -105,6 +105,9 @@ void crypto_fpu_exit(void);
#define AVX_GEN4_OPTSIZE 4096
#ifdef CONFIG_X86_64
+
+static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
@@ -155,6 +158,12 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
#ifdef CONFIG_AS_AVX
+asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
/*
* asmlinkage void aesni_gcm_precomp_avx_gen2()
* gcm_data *my_ctx_data, context data
@@ -472,6 +481,25 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}
+#if 0 /* temporary disabled due to failing crypto tests */
+static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv)
+{
+ /*
+ * based on key length, override with the by8 version
+ * of ctr mode encryption/decryption for improved performance
+ * aes_set_key_common() ensures that key length is one of
+ * {128,192,256}
+ */
+ if (ctx->key_length == AES_KEYSIZE_128)
+ aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
+ else if (ctx->key_length == AES_KEYSIZE_192)
+ aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
+ else
+ aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
+}
+#endif
+
static int ctr_crypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
@@ -486,8 +514,8 @@ static int ctr_crypt(struct blkcipher_desc *desc,
kernel_fpu_begin();
while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
- aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK, walk.iv);
+ aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
@@ -1493,6 +1521,14 @@ static int __init aesni_init(void)
aesni_gcm_enc_tfm = aesni_gcm_enc;
aesni_gcm_dec_tfm = aesni_gcm_dec;
}
+ aesni_ctr_enc_tfm = aesni_ctr_enc;
+#if 0 /* temporary disabled due to failing crypto tests */
+ if (cpu_has_avx) {
+ /* optimize performance of ctr mode encryption transform */
+ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+ pr_info("AES CTR mode by8 optimization enabled\n");
+ }
+#endif
#endif
err = crypto_fpu_init();
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index dbc4339b5417..26d49ebae040 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -72,6 +72,7 @@
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+.text
ENTRY(crc_pcl)
#define bufp %rdi
#define bufp_dw %edi
@@ -216,15 +217,11 @@ LABEL crc_ %i
## 4) Combine three results:
################################################################
- lea (K_table-16)(%rip), bufp # first entry is for idx 1
+ lea (K_table-8)(%rip), bufp # first entry is for idx 1
shlq $3, %rax # rax *= 8
- subq %rax, tmp # tmp -= rax*8
- shlq $1, %rax
- subq %rax, tmp # tmp -= rax*16
- # (total tmp -= rax*24)
- addq %rax, bufp
-
- movdqa (bufp), %xmm0 # 2 consts: K1:K2
+ pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2
+ leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
+ subq %rax, tmp # tmp -= rax*24
movq crc_init, %xmm1 # CRC for block 1
PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2
@@ -238,9 +235,9 @@ LABEL crc_ %i
mov crc2, crc_init
crc32 %rax, crc_init
-################################################################
-## 5) Check for end:
-################################################################
+ ################################################################
+ ## 5) Check for end:
+ ################################################################
LABEL crc_ 0
mov tmp, len
@@ -331,136 +328,136 @@ ENDPROC(crc_pcl)
################################################################
## PCLMULQDQ tables
- ## Table is 128 entries x 2 quad words each
+ ## Table is 128 entries x 2 words (8 bytes) each
################################################################
-.data
-.align 64
+.section .rotata, "a", %progbits
+.align 8
K_table:
- .quad 0x14cd00bd6,0x105ec76f0
- .quad 0x0ba4fc28e,0x14cd00bd6
- .quad 0x1d82c63da,0x0f20c0dfe
- .quad 0x09e4addf8,0x0ba4fc28e
- .quad 0x039d3b296,0x1384aa63a
- .quad 0x102f9b8a2,0x1d82c63da
- .quad 0x14237f5e6,0x01c291d04
- .quad 0x00d3b6092,0x09e4addf8
- .quad 0x0c96cfdc0,0x0740eef02
- .quad 0x18266e456,0x039d3b296
- .quad 0x0daece73e,0x0083a6eec
- .quad 0x0ab7aff2a,0x102f9b8a2
- .quad 0x1248ea574,0x1c1733996
- .quad 0x083348832,0x14237f5e6
- .quad 0x12c743124,0x02ad91c30
- .quad 0x0b9e02b86,0x00d3b6092
- .quad 0x018b33a4e,0x06992cea2
- .quad 0x1b331e26a,0x0c96cfdc0
- .quad 0x17d35ba46,0x07e908048
- .quad 0x1bf2e8b8a,0x18266e456
- .quad 0x1a3e0968a,0x11ed1f9d8
- .quad 0x0ce7f39f4,0x0daece73e
- .quad 0x061d82e56,0x0f1d0f55e
- .quad 0x0d270f1a2,0x0ab7aff2a
- .quad 0x1c3f5f66c,0x0a87ab8a8
- .quad 0x12ed0daac,0x1248ea574
- .quad 0x065863b64,0x08462d800
- .quad 0x11eef4f8e,0x083348832
- .quad 0x1ee54f54c,0x071d111a8
- .quad 0x0b3e32c28,0x12c743124
- .quad 0x0064f7f26,0x0ffd852c6
- .quad 0x0dd7e3b0c,0x0b9e02b86
- .quad 0x0f285651c,0x0dcb17aa4
- .quad 0x010746f3c,0x018b33a4e
- .quad 0x1c24afea4,0x0f37c5aee
- .quad 0x0271d9844,0x1b331e26a
- .quad 0x08e766a0c,0x06051d5a2
- .quad 0x093a5f730,0x17d35ba46
- .quad 0x06cb08e5c,0x11d5ca20e
- .quad 0x06b749fb2,0x1bf2e8b8a
- .quad 0x1167f94f2,0x021f3d99c
- .quad 0x0cec3662e,0x1a3e0968a
- .quad 0x19329634a,0x08f158014
- .quad 0x0e6fc4e6a,0x0ce7f39f4
- .quad 0x08227bb8a,0x1a5e82106
- .quad 0x0b0cd4768,0x061d82e56
- .quad 0x13c2b89c4,0x188815ab2
- .quad 0x0d7a4825c,0x0d270f1a2
- .quad 0x10f5ff2ba,0x105405f3e
- .quad 0x00167d312,0x1c3f5f66c
- .quad 0x0f6076544,0x0e9adf796
- .quad 0x026f6a60a,0x12ed0daac
- .quad 0x1a2adb74e,0x096638b34
- .quad 0x19d34af3a,0x065863b64
- .quad 0x049c3cc9c,0x1e50585a0
- .quad 0x068bce87a,0x11eef4f8e
- .quad 0x1524fa6c6,0x19f1c69dc
- .quad 0x16cba8aca,0x1ee54f54c
- .quad 0x042d98888,0x12913343e
- .quad 0x1329d9f7e,0x0b3e32c28
- .quad 0x1b1c69528,0x088f25a3a
- .quad 0x02178513a,0x0064f7f26
- .quad 0x0e0ac139e,0x04e36f0b0
- .quad 0x0170076fa,0x0dd7e3b0c
- .quad 0x141a1a2e2,0x0bd6f81f8
- .quad 0x16ad828b4,0x0f285651c
- .quad 0x041d17b64,0x19425cbba
- .quad 0x1fae1cc66,0x010746f3c
- .quad 0x1a75b4b00,0x18db37e8a
- .quad 0x0f872e54c,0x1c24afea4
- .quad 0x01e41e9fc,0x04c144932
- .quad 0x086d8e4d2,0x0271d9844
- .quad 0x160f7af7a,0x052148f02
- .quad 0x05bb8f1bc,0x08e766a0c
- .quad 0x0a90fd27a,0x0a3c6f37a
- .quad 0x0b3af077a,0x093a5f730
- .quad 0x04984d782,0x1d22c238e
- .quad 0x0ca6ef3ac,0x06cb08e5c
- .quad 0x0234e0b26,0x063ded06a
- .quad 0x1d88abd4a,0x06b749fb2
- .quad 0x04597456a,0x04d56973c
- .quad 0x0e9e28eb4,0x1167f94f2
- .quad 0x07b3ff57a,0x19385bf2e
- .quad 0x0c9c8b782,0x0cec3662e
- .quad 0x13a9cba9e,0x0e417f38a
- .quad 0x093e106a4,0x19329634a
- .quad 0x167001a9c,0x14e727980
- .quad 0x1ddffc5d4,0x0e6fc4e6a
- .quad 0x00df04680,0x0d104b8fc
- .quad 0x02342001e,0x08227bb8a
- .quad 0x00a2a8d7e,0x05b397730
- .quad 0x168763fa6,0x0b0cd4768
- .quad 0x1ed5a407a,0x0e78eb416
- .quad 0x0d2c3ed1a,0x13c2b89c4
- .quad 0x0995a5724,0x1641378f0
- .quad 0x19b1afbc4,0x0d7a4825c
- .quad 0x109ffedc0,0x08d96551c
- .quad 0x0f2271e60,0x10f5ff2ba
- .quad 0x00b0bf8ca,0x00bf80dd2
- .quad 0x123888b7a,0x00167d312
- .quad 0x1e888f7dc,0x18dcddd1c
- .quad 0x002ee03b2,0x0f6076544
- .quad 0x183e8d8fe,0x06a45d2b2
- .quad 0x133d7a042,0x026f6a60a
- .quad 0x116b0f50c,0x1dd3e10e8
- .quad 0x05fabe670,0x1a2adb74e
- .quad 0x130004488,0x0de87806c
- .quad 0x000bcf5f6,0x19d34af3a
- .quad 0x18f0c7078,0x014338754
- .quad 0x017f27698,0x049c3cc9c
- .quad 0x058ca5f00,0x15e3e77ee
- .quad 0x1af900c24,0x068bce87a
- .quad 0x0b5cfca28,0x0dd07448e
- .quad 0x0ded288f8,0x1524fa6c6
- .quad 0x059f229bc,0x1d8048348
- .quad 0x06d390dec,0x16cba8aca
- .quad 0x037170390,0x0a3e3e02c
- .quad 0x06353c1cc,0x042d98888
- .quad 0x0c4584f5c,0x0d73c7bea
- .quad 0x1f16a3418,0x1329d9f7e
- .quad 0x0531377e2,0x185137662
- .quad 0x1d8d9ca7c,0x1b1c69528
- .quad 0x0b25b29f2,0x18a08b5bc
- .quad 0x19fb2a8b0,0x02178513a
- .quad 0x1a08fe6ac,0x1da758ae0
- .quad 0x045cddf4e,0x0e0ac139e
- .quad 0x1a91647f2,0x169cf9eb0
- .quad 0x1a0f717c4,0x0170076fa
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
new file mode 100644
index 000000000000..038f6ae87c5e
--- /dev/null
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -0,0 +1,805 @@
+/*
+ * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/linkage.h>
+
+.file "des3_ede-asm_64.S"
+.text
+
+#define s1 .L_s1
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rbp
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %ebp
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) \
+ movl a, RT0d; \
+ shrl $(offset), RT0d; \
+ xorl b, RT0d; \
+ andl $(mask), RT0d; \
+ xorl RT0d, b; \
+ shll $(offset), RT0d; \
+ xorl RT0d, a;
+
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation(left, right) \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ movl left##d, RW0d; \
+ roll $1, right##d; \
+ xorl right##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##d; \
+ xorl RW0d, right##d; \
+ roll $1, left##d; \
+ expand_to_64bits(right, RT3); \
+ expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+ compress_to_64bits(right); \
+ compress_to_64bits(left); \
+ movl right##d, RW0d; \
+ rorl $1, left##d; \
+ xorl left##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##d; \
+ xorl RW0d, left##d; \
+ rorl $1, right##d; \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+ xorq from, RW0; \
+ \
+ movzbl RW0bl, RT0d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ shrq $16, RW0; \
+ movq s8(, RT0, 8), RT0; \
+ xorq s6(, RT1, 8), to; \
+ movzbl RW0bl, RL1d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s4(, RT2, 8), RT0; \
+ xorq s2(, RT3, 8), to; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ xorq s7(, RL1, 8), RT0; \
+ xorq s5(, RT1, 8), to; \
+ xorq s3(, RT2, 8), RT0; \
+ load_next_key(n, RW0); \
+ xorq RT0, to; \
+ xorq s1(, RT3, 8), to; \
+
+#define load_next_key(n, RWx) \
+ movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /*_*/
+
+#define read_block(io, left, right) \
+ movl (io), left##d; \
+ movl 4(io), right##d; \
+ bswapl left##d; \
+ bswapl right##d;
+
+#define write_block(io, left, right) \
+ bswapl left##d; \
+ bswapl right##d; \
+ movl left##d, (io); \
+ movl right##d, 4(io);
+
+ENTRY(des3_ede_x86_64_crypt_blk)
+ /* input:
+ * %rdi: round keys, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ read_block(%rdx, RL0, RR0);
+ initial_permutation(RL0, RR0);
+
+ movq (CTX), RW0;
+
+ round1(0, RR0, RL0, load_next_key);
+ round1(1, RL0, RR0, load_next_key);
+ round1(2, RR0, RL0, load_next_key);
+ round1(3, RL0, RR0, load_next_key);
+ round1(4, RR0, RL0, load_next_key);
+ round1(5, RL0, RR0, load_next_key);
+ round1(6, RR0, RL0, load_next_key);
+ round1(7, RL0, RR0, load_next_key);
+ round1(8, RR0, RL0, load_next_key);
+ round1(9, RL0, RR0, load_next_key);
+ round1(10, RR0, RL0, load_next_key);
+ round1(11, RL0, RR0, load_next_key);
+ round1(12, RR0, RL0, load_next_key);
+ round1(13, RL0, RR0, load_next_key);
+ round1(14, RR0, RL0, load_next_key);
+ round1(15, RL0, RR0, load_next_key);
+
+ round1(16+0, RL0, RR0, load_next_key);
+ round1(16+1, RR0, RL0, load_next_key);
+ round1(16+2, RL0, RR0, load_next_key);
+ round1(16+3, RR0, RL0, load_next_key);
+ round1(16+4, RL0, RR0, load_next_key);
+ round1(16+5, RR0, RL0, load_next_key);
+ round1(16+6, RL0, RR0, load_next_key);
+ round1(16+7, RR0, RL0, load_next_key);
+ round1(16+8, RL0, RR0, load_next_key);
+ round1(16+9, RR0, RL0, load_next_key);
+ round1(16+10, RL0, RR0, load_next_key);
+ round1(16+11, RR0, RL0, load_next_key);
+ round1(16+12, RL0, RR0, load_next_key);
+ round1(16+13, RR0, RL0, load_next_key);
+ round1(16+14, RL0, RR0, load_next_key);
+ round1(16+15, RR0, RL0, load_next_key);
+
+ round1(32+0, RR0, RL0, load_next_key);
+ round1(32+1, RL0, RR0, load_next_key);
+ round1(32+2, RR0, RL0, load_next_key);
+ round1(32+3, RL0, RR0, load_next_key);
+ round1(32+4, RR0, RL0, load_next_key);
+ round1(32+5, RL0, RR0, load_next_key);
+ round1(32+6, RR0, RL0, load_next_key);
+ round1(32+7, RL0, RR0, load_next_key);
+ round1(32+8, RR0, RL0, load_next_key);
+ round1(32+9, RL0, RR0, load_next_key);
+ round1(32+10, RR0, RL0, load_next_key);
+ round1(32+11, RL0, RR0, load_next_key);
+ round1(32+12, RR0, RL0, load_next_key);
+ round1(32+13, RL0, RR0, load_next_key);
+ round1(32+14, RR0, RL0, load_next_key);
+ round1(32+15, RL0, RR0, dummy2);
+
+ final_permutation(RR0, RL0);
+ write_block(%rsi, RR0, RL0);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk)
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ \
+ movl left##0d, RW0d; \
+ roll $1, right##0d; \
+ xorl right##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##0d; \
+ xorl RW0d, right##0d; \
+ roll $1, left##0d; \
+ expand_to_64bits(right##0, RT3); \
+ expand_to_64bits(left##0, RT3); \
+ movl left##1d, RW1d; \
+ roll $1, right##1d; \
+ xorl right##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, left##1d; \
+ xorl RW1d, right##1d; \
+ roll $1, left##1d; \
+ expand_to_64bits(right##1, RT3); \
+ expand_to_64bits(left##1, RT3); \
+ movl left##2d, RW2d; \
+ roll $1, right##2d; \
+ xorl right##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, left##2d; \
+ xorl RW2d, right##2d; \
+ roll $1, left##2d; \
+ expand_to_64bits(right##2, RT3); \
+ expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+ compress_to_64bits(right##0); \
+ compress_to_64bits(left##0); \
+ movl right##0d, RW0d; \
+ rorl $1, left##0d; \
+ xorl left##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##0d; \
+ xorl RW0d, left##0d; \
+ rorl $1, right##0d; \
+ compress_to_64bits(right##1); \
+ compress_to_64bits(left##1); \
+ movl right##1d, RW1d; \
+ rorl $1, left##1d; \
+ xorl left##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, right##1d; \
+ xorl RW1d, left##1d; \
+ rorl $1, right##1d; \
+ compress_to_64bits(right##2); \
+ compress_to_64bits(left##2); \
+ movl right##2d, RW2d; \
+ rorl $1, left##2d; \
+ xorl left##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, right##2d; \
+ xorl RW2d, left##2d; \
+ rorl $1, right##2d; \
+ \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+ xorq from##0, RW0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s8(, RT3, 8), to##0; \
+ xorq s6(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s4(, RT3, 8), to##0; \
+ xorq s2(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s7(, RT3, 8), to##0; \
+ xorq s5(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ load_next_key(n, RW0); \
+ xorq s3(, RT3, 8), to##0; \
+ xorq s1(, RT1, 8), to##0; \
+ xorq from##1, RW1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s8(, RT3, 8), to##1; \
+ xorq s6(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s4(, RT3, 8), to##1; \
+ xorq s2(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrl $16, RW1d; \
+ xorq s7(, RT3, 8), to##1; \
+ xorq s5(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ do_movq(RW0, RW1); \
+ xorq s3(, RT3, 8), to##1; \
+ xorq s1(, RT1, 8), to##1; \
+ xorq from##2, RW2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s8(, RT3, 8), to##2; \
+ xorq s6(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s4(, RT3, 8), to##2; \
+ xorq s2(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrl $16, RW2d; \
+ xorq s7(, RT3, 8), to##2; \
+ xorq s5(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ do_movq(RW0, RW2); \
+ xorq s3(, RT3, 8), to##2; \
+ xorq s1(, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+ movq src, dst;
+
+ENTRY(des3_ede_x86_64_crypt_blk_3way)
+ /* input:
+ * %rdi: ctx, round keys
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ */
+
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ /* load input */
+ movl 0 * 4(%rdx), RL0d;
+ movl 1 * 4(%rdx), RR0d;
+ movl 2 * 4(%rdx), RL1d;
+ movl 3 * 4(%rdx), RR1d;
+ movl 4 * 4(%rdx), RL2d;
+ movl 5 * 4(%rdx), RR2d;
+
+ bswapl RL0d;
+ bswapl RR0d;
+ bswapl RL1d;
+ bswapl RR1d;
+ bswapl RL2d;
+ bswapl RR2d;
+
+ initial_permutation3(RL, RR);
+
+ movq 0(CTX), RW0;
+ movq RW0, RW1;
+ movq RW0, RW2;
+
+ round3(0, RR, RL, load_next_key, __movq);
+ round3(1, RL, RR, load_next_key, __movq);
+ round3(2, RR, RL, load_next_key, __movq);
+ round3(3, RL, RR, load_next_key, __movq);
+ round3(4, RR, RL, load_next_key, __movq);
+ round3(5, RL, RR, load_next_key, __movq);
+ round3(6, RR, RL, load_next_key, __movq);
+ round3(7, RL, RR, load_next_key, __movq);
+ round3(8, RR, RL, load_next_key, __movq);
+ round3(9, RL, RR, load_next_key, __movq);
+ round3(10, RR, RL, load_next_key, __movq);
+ round3(11, RL, RR, load_next_key, __movq);
+ round3(12, RR, RL, load_next_key, __movq);
+ round3(13, RL, RR, load_next_key, __movq);
+ round3(14, RR, RL, load_next_key, __movq);
+ round3(15, RL, RR, load_next_key, __movq);
+
+ round3(16+0, RL, RR, load_next_key, __movq);
+ round3(16+1, RR, RL, load_next_key, __movq);
+ round3(16+2, RL, RR, load_next_key, __movq);
+ round3(16+3, RR, RL, load_next_key, __movq);
+ round3(16+4, RL, RR, load_next_key, __movq);
+ round3(16+5, RR, RL, load_next_key, __movq);
+ round3(16+6, RL, RR, load_next_key, __movq);
+ round3(16+7, RR, RL, load_next_key, __movq);
+ round3(16+8, RL, RR, load_next_key, __movq);
+ round3(16+9, RR, RL, load_next_key, __movq);
+ round3(16+10, RL, RR, load_next_key, __movq);
+ round3(16+11, RR, RL, load_next_key, __movq);
+ round3(16+12, RL, RR, load_next_key, __movq);
+ round3(16+13, RR, RL, load_next_key, __movq);
+ round3(16+14, RL, RR, load_next_key, __movq);
+ round3(16+15, RR, RL, load_next_key, __movq);
+
+ round3(32+0, RR, RL, load_next_key, __movq);
+ round3(32+1, RL, RR, load_next_key, __movq);
+ round3(32+2, RR, RL, load_next_key, __movq);
+ round3(32+3, RL, RR, load_next_key, __movq);
+ round3(32+4, RR, RL, load_next_key, __movq);
+ round3(32+5, RL, RR, load_next_key, __movq);
+ round3(32+6, RR, RL, load_next_key, __movq);
+ round3(32+7, RL, RR, load_next_key, __movq);
+ round3(32+8, RR, RL, load_next_key, __movq);
+ round3(32+9, RL, RR, load_next_key, __movq);
+ round3(32+10, RR, RL, load_next_key, __movq);
+ round3(32+11, RL, RR, load_next_key, __movq);
+ round3(32+12, RR, RL, load_next_key, __movq);
+ round3(32+13, RL, RR, load_next_key, __movq);
+ round3(32+14, RR, RL, load_next_key, __movq);
+ round3(32+15, RL, RR, dummy2, dummy2);
+
+ final_permutation3(RR, RL);
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk_3way)
+
+.data
+.align 16
+.L_s1:
+ .quad 0x0010100001010400, 0x0000000000000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0010100001010004, 0x0000100000010404
+ .quad 0x0000000000000004, 0x0000100000010000
+ .quad 0x0000000000000400, 0x0010100001010400
+ .quad 0x0010100001010404, 0x0000000000000400
+ .quad 0x0010000001000404, 0x0010100001010004
+ .quad 0x0010000001000000, 0x0000000000000004
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000100000010400
+ .quad 0x0000100000010400, 0x0010100001010000
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0000100000010004, 0x0010000001000004
+ .quad 0x0010000001000004, 0x0000100000010004
+ .quad 0x0000000000000000, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010000001000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0000000000000004, 0x0010100001010000
+ .quad 0x0010100001010400, 0x0010000001000000
+ .quad 0x0010000001000000, 0x0000000000000400
+ .quad 0x0010100001010004, 0x0000100000010000
+ .quad 0x0000100000010400, 0x0010000001000004
+ .quad 0x0000000000000400, 0x0000000000000004
+ .quad 0x0010000001000404, 0x0000100000010404
+ .quad 0x0010100001010404, 0x0000100000010004
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0010000001000004, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010100001010400
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000000000000000
+ .quad 0x0000100000010004, 0x0000100000010400
+ .quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+ .quad 0x0801080200100020, 0x0800080000000000
+ .quad 0x0000080000000000, 0x0001080200100020
+ .quad 0x0001000000100000, 0x0000000200000020
+ .quad 0x0801000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0801080200100020
+ .quad 0x0801080000100000, 0x0800000000000000
+ .quad 0x0800080000000000, 0x0001000000100000
+ .quad 0x0000000200000020, 0x0801000200100020
+ .quad 0x0001080000100000, 0x0001000200100020
+ .quad 0x0800080200000020, 0x0000000000000000
+ .quad 0x0800000000000000, 0x0000080000000000
+ .quad 0x0001080200100020, 0x0801000000100000
+ .quad 0x0001000200100020, 0x0800000200000020
+ .quad 0x0000000000000000, 0x0001080000100000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0801000000100000, 0x0000080200000020
+ .quad 0x0000000000000000, 0x0001080200100020
+ .quad 0x0801000200100020, 0x0001000000100000
+ .quad 0x0800080200000020, 0x0801000000100000
+ .quad 0x0801080000100000, 0x0000080000000000
+ .quad 0x0801000000100000, 0x0800080000000000
+ .quad 0x0000000200000020, 0x0801080200100020
+ .quad 0x0001080200100020, 0x0000000200000020
+ .quad 0x0000080000000000, 0x0800000000000000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0001000000100000, 0x0800000200000020
+ .quad 0x0001000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0001000200100020
+ .quad 0x0001080000100000, 0x0000000000000000
+ .quad 0x0800080000000000, 0x0000080200000020
+ .quad 0x0800000000000000, 0x0801000200100020
+ .quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+ .quad 0x0000002000000208, 0x0000202008020200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000202000020208, 0x0000002008000200
+ .quad 0x0000200000020008, 0x0000000008000008
+ .quad 0x0000000008000008, 0x0000200000020000
+ .quad 0x0000202008020208, 0x0000200000020008
+ .quad 0x0000200008020000, 0x0000002000000208
+ .quad 0x0000000008000000, 0x0000000000000008
+ .quad 0x0000202008020200, 0x0000002000000200
+ .quad 0x0000202000020200, 0x0000200008020000
+ .quad 0x0000200008020008, 0x0000202000020208
+ .quad 0x0000002008000208, 0x0000202000020200
+ .quad 0x0000200000020000, 0x0000002008000208
+ .quad 0x0000000000000008, 0x0000202008020208
+ .quad 0x0000002000000200, 0x0000000008000000
+ .quad 0x0000202008020200, 0x0000000008000000
+ .quad 0x0000200000020008, 0x0000002000000208
+ .quad 0x0000200000020000, 0x0000202008020200
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000002000000200, 0x0000200000020008
+ .quad 0x0000202008020208, 0x0000002008000200
+ .quad 0x0000000008000008, 0x0000002000000200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000208, 0x0000200000020000
+ .quad 0x0000000008000000, 0x0000202008020208
+ .quad 0x0000000000000008, 0x0000202000020208
+ .quad 0x0000202000020200, 0x0000000008000008
+ .quad 0x0000200008020000, 0x0000002008000208
+ .quad 0x0000002000000208, 0x0000200008020000
+ .quad 0x0000202000020208, 0x0000000000000008
+ .quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x0000020000002000, 0x0008020800002000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+ .quad 0x0000001000000100, 0x0020001002080100
+ .quad 0x0020000002080000, 0x0420001002000100
+ .quad 0x0000000000080000, 0x0000001000000100
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0400001000080100, 0x0000000000080000
+ .quad 0x0020001002000100, 0x0400001000080100
+ .quad 0x0420001002000100, 0x0420000002080000
+ .quad 0x0000001000080100, 0x0400000000000000
+ .quad 0x0020000002000000, 0x0400000000080000
+ .quad 0x0400000000080000, 0x0000000000000000
+ .quad 0x0400001000000100, 0x0420001002080100
+ .quad 0x0420001002080100, 0x0020001002000100
+ .quad 0x0420000002080000, 0x0400001000000100
+ .quad 0x0000000000000000, 0x0420000002000000
+ .quad 0x0020001002080100, 0x0020000002000000
+ .quad 0x0420000002000000, 0x0000001000080100
+ .quad 0x0000000000080000, 0x0420001002000100
+ .quad 0x0000001000000100, 0x0020000002000000
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0420001002000100, 0x0400001000080100
+ .quad 0x0020001002000100, 0x0400000000000000
+ .quad 0x0420000002080000, 0x0020001002080100
+ .quad 0x0400001000080100, 0x0000001000000100
+ .quad 0x0020000002000000, 0x0420000002080000
+ .quad 0x0420001002080100, 0x0000001000080100
+ .quad 0x0420000002000000, 0x0420001002080100
+ .quad 0x0020000002080000, 0x0000000000000000
+ .quad 0x0400000000080000, 0x0420000002000000
+ .quad 0x0000001000080100, 0x0020001002000100
+ .quad 0x0400001000000100, 0x0000000000080000
+ .quad 0x0000000000000000, 0x0400000000080000
+ .quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+ .quad 0x0200000120000010, 0x0204000020000000
+ .quad 0x0000040000000000, 0x0204040120000010
+ .quad 0x0204000020000000, 0x0000000100000010
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0200040020000000, 0x0004040100000010
+ .quad 0x0004000000000000, 0x0200000120000010
+ .quad 0x0004000100000010, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0000000000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000040000000000
+ .quad 0x0004040000000000, 0x0200040120000010
+ .quad 0x0000000100000010, 0x0204000120000010
+ .quad 0x0204000120000010, 0x0000000000000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000040100000010, 0x0004040000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0200040020000000, 0x0000000100000010
+ .quad 0x0204000120000010, 0x0004040000000000
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0000040100000010, 0x0200000120000010
+ .quad 0x0004000000000000, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0200000120000010, 0x0204040120000010
+ .quad 0x0004040000000000, 0x0204000020000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000000000000000, 0x0204000120000010
+ .quad 0x0000000100000010, 0x0000040000000000
+ .quad 0x0204000020000000, 0x0004040100000010
+ .quad 0x0000040000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000000000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+ .quad 0x0002000000200000, 0x2002000004200002
+ .quad 0x2000000004000802, 0x0000000000000000
+ .quad 0x0000000000000800, 0x2000000004000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2002000004200802, 0x0002000000200000
+ .quad 0x0000000000000000, 0x2000000004000002
+ .quad 0x2000000000000002, 0x0000000004000000
+ .quad 0x2002000004200002, 0x2000000000000802
+ .quad 0x0000000004000800, 0x2002000000200802
+ .quad 0x2002000000200002, 0x0000000004000800
+ .quad 0x2000000004000002, 0x0002000004200000
+ .quad 0x0002000004200800, 0x2002000000200002
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000000000802, 0x2002000004200802
+ .quad 0x0002000000200800, 0x2000000000000002
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0002000000200000, 0x2000000004000802
+ .quad 0x2000000004000802, 0x2002000004200002
+ .quad 0x2002000004200002, 0x2000000000000002
+ .quad 0x2002000000200002, 0x0000000004000000
+ .quad 0x0000000004000800, 0x0002000000200000
+ .quad 0x0002000004200800, 0x2000000000000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2000000000000802, 0x2000000004000002
+ .quad 0x2002000004200802, 0x0002000004200000
+ .quad 0x0002000000200800, 0x0000000000000000
+ .quad 0x2000000000000002, 0x2002000004200802
+ .quad 0x0000000000000000, 0x2002000000200802
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000004000002, 0x0000000004000800
+ .quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+ .quad 0x0100010410001000, 0x0000010000001000
+ .quad 0x0000000000040000, 0x0100010410041000
+ .quad 0x0100000010000000, 0x0100010410001000
+ .quad 0x0000000400000000, 0x0100000010000000
+ .quad 0x0000000400040000, 0x0100000010040000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0100010010041000, 0x0000010400041000
+ .quad 0x0000010000001000, 0x0000000400000000
+ .quad 0x0100000010040000, 0x0100000410000000
+ .quad 0x0100010010001000, 0x0000010400001000
+ .quad 0x0000010000041000, 0x0000000400040000
+ .quad 0x0100000410040000, 0x0100010010041000
+ .quad 0x0000010400001000, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0100000410040000
+ .quad 0x0100000410000000, 0x0100010010001000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0100010010041000, 0x0000010000001000
+ .quad 0x0000000400000000, 0x0100000410040000
+ .quad 0x0000010000001000, 0x0000010400041000
+ .quad 0x0100010010001000, 0x0000000400000000
+ .quad 0x0100000410000000, 0x0100000010040000
+ .quad 0x0100000410040000, 0x0100000010000000
+ .quad 0x0000000000040000, 0x0100010410001000
+ .quad 0x0000000000000000, 0x0100010410041000
+ .quad 0x0000000400040000, 0x0100000410000000
+ .quad 0x0100000010040000, 0x0100010010001000
+ .quad 0x0100010410001000, 0x0000000000000000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0000010000041000, 0x0000010400001000
+ .quad 0x0000010400001000, 0x0000000400040000
+ .quad 0x0100000010000000, 0x0100010010041000
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
new file mode 100644
index 000000000000..0e9c0668fe4e
--- /dev/null
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -0,0 +1,509 @@
+/*
+ * Glue Code for assembler optimized version of 3DES
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <asm/processor.h>
+#include <crypto/des.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+
+struct des3_ede_x86_ctx {
+ u32 enc_expkey[DES3_EDE_EXPKEY_WORDS];
+ u32 dec_expkey[DES3_EDE_EXPKEY_WORDS];
+};
+
+/* regular block cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc_expkey;
+
+ des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec_expkey;
+
+ des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
+}
+
+static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc_expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec_expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
+}
+
+static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+ const u32 *expkey)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes;
+ int err;
+
+ err = blkcipher_walk_virt(desc, walk);
+
+ while ((nbytes = walk->nbytes)) {
+ u8 *wsrc = walk->src.virt.addr;
+ u8 *wdst = walk->dst.virt.addr;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
+ wsrc);
+
+ wsrc += bsize * 3;
+ wdst += bsize * 3;
+ nbytes -= bsize * 3;
+ } while (nbytes >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = blkcipher_walk_done(desc, walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, ctx->enc_expkey);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, ctx->dec_expkey);
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 *iv = (u64 *)walk->iv;
+
+ do {
+ *dst = *src ^ *iv;
+ des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk->iv = *iv;
+ return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ivs[3 - 1];
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ nbytes -= bsize * 3 - bsize;
+ src -= 3 - 1;
+ dst -= 3 - 1;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+
+ des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+ dst[1] ^= ivs[0];
+ dst[2] ^= ivs[1];
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * 3);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_decrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+
+static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
+ struct blkcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[DES3_EDE_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ des3_ede_enc_blk(ctx, keystream, ctrblk);
+ crypto_xor(keystream, src, nbytes);
+ memcpy(dst, keystream, nbytes);
+
+ crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ __be64 *src = (__be64 *)walk->src.virt.addr;
+ __be64 *dst = (__be64 *)walk->dst.virt.addr;
+ u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+ __be64 ctrblocks[3];
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ /* create ctrblks for parallel encrypt */
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblocks[1] = cpu_to_be64(ctrblk++);
+ ctrblocks[2] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
+ (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+ dst[1] = src[1] ^ ctrblocks[1];
+ dst[2] = src[2] ^ ctrblocks[2];
+
+ src += 3;
+ dst += 3;
+ } while ((nbytes -= bsize * 3) >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+
+ src += 1;
+ dst += 1;
+ } while ((nbytes -= bsize) >= bsize);
+
+done:
+ *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, DES3_EDE_BLOCK_SIZE);
+
+ while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) {
+ nbytes = __ctr_crypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ if (walk.nbytes) {
+ ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+
+ return err;
+}
+
+static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 i, j, tmp;
+ int err;
+
+ /* Generate encryption context using generic implementation. */
+ err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen);
+ if (err < 0)
+ return err;
+
+ /* Fix encryption context for this implementation and form decryption
+ * context. */
+ j = DES3_EDE_EXPKEY_WORDS - 2;
+ for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
+ tmp = ror32(ctx->enc_expkey[i + 1], 4);
+ ctx->enc_expkey[i + 1] = tmp;
+
+ ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0];
+ ctx->dec_expkey[j + 1] = tmp;
+ }
+
+ return 0;
+}
+
+static struct crypto_alg des3_ede_algs[4] = { {
+ .cra_name = "des3_ede",
+ .cra_driver_name = "des3_ede-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = DES3_EDE_KEY_SIZE,
+ .cia_max_keysize = DES3_EDE_KEY_SIZE,
+ .cia_setkey = des3_ede_x86_setkey,
+ .cia_encrypt = des3_ede_x86_encrypt,
+ .cia_decrypt = des3_ede_x86_decrypt,
+ }
+ }
+}, {
+ .cra_name = "ecb(des3_ede)",
+ .cra_driver_name = "ecb-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ },
+ },
+}, {
+ .cra_name = "cbc(des3_ede)",
+ .cra_driver_name = "cbc-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+ },
+}, {
+ .cra_name = "ctr(des3_ede)",
+ .cra_driver_name = "ctr-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+ },
+} };
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, des3_ede-x86_64 is slower than generic C
+ * implementation because use of 64bit rotates (which are really
+ * slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init des3_ede_x86_init(void)
+{
+ if (!force && is_blacklisted_cpu()) {
+ pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
+ return -ENODEV;
+ }
+
+ return crypto_register_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
+}
+
+static void __exit des3_ede_x86_fini(void)
+{
+ crypto_unregister_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
+}
+
+module_init(des3_ede_x86_init);
+module_exit(des3_ede_x86_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
+MODULE_ALIAS("des3_ede");
+MODULE_ALIAS("des3_ede-asm");
+MODULE_ALIAS("des");
+MODULE_ALIAS("des-asm");
+MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 3ca9762e1649..3bf000fab0ae 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -5,6 +5,7 @@ genhdr-y += unistd_64.h
genhdr-y += unistd_x32.h
generic-y += clkdev.h
-generic-y += early_ioremap.h
generic-y += cputime.h
+generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
+generic-y += scatterlist.h
diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h
index 66873297e9f5..1b010a859b8b 100644
--- a/arch/x86/include/asm/acenv.h
+++ b/arch/x86/include/asm/acenv.h
@@ -18,8 +18,6 @@
#define ACPI_FLUSH_CPU_CACHE() wbinvd()
-#ifdef CONFIG_ACPI
-
int __acpi_acquire_global_lock(unsigned int *lock);
int __acpi_release_global_lock(unsigned int *lock);
@@ -44,6 +42,4 @@ int __acpi_release_global_lock(unsigned int *lock);
: "=r"(n_hi), "=r"(n_lo) \
: "0"(n_hi), "1"(n_lo))
-#endif
-
#endif /* _ASM_X86_ACENV_H */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index e06225eda635..0ab4f9fd2687 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -121,6 +121,11 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
}
+static inline bool acpi_has_cpu_in_madt(void)
+{
+ return !!acpi_lapic;
+}
+
#else /* !CONFIG_ACPI */
#define acpi_lapic 0
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0a3f9c9f98d5..473bdbee378a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -161,6 +161,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
: : "i" (0), ## input)
+/*
+ * This is similar to alternative_input. But it has two features and
+ * respective instructions.
+ *
+ * If CPU has feature2, newinstr2 is used.
+ * Otherwise, if CPU has feature1, newinstr1 is used.
+ * Otherwise, oldinstr is used.
+ */
+#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \
+ feature2, input...) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \
+ newinstr2, feature2) \
+ : : "i" (0), ## input)
+
/* Like alternative_input, but with a single output argument */
#define alternative_io(oldinstr, newinstr, feature, output, input...) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 19b0ebafcd3e..465b309af254 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -85,21 +85,13 @@ static inline bool apic_from_smp_config(void)
#include <asm/paravirt.h>
#endif
-#ifdef CONFIG_X86_64
-extern int is_vsmp_box(void);
-#else
-static inline int is_vsmp_box(void)
-{
- return 0;
-}
-#endif
extern int setup_profiling_timer(unsigned int);
static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
@@ -300,7 +292,6 @@ struct apic {
int dest_logical;
unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
- unsigned long (*check_apicid_present)(int apicid);
void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
const struct cpumask *mask);
@@ -309,21 +300,11 @@ struct apic {
void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
void (*setup_apic_routing)(void);
- int (*multi_timer_check)(int apic, int irq);
int (*cpu_present_to_apicid)(int mps_cpu);
void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
- void (*setup_portio_remap)(void);
int (*check_phys_apicid_present)(int phys_apicid);
- void (*enable_apic_mode)(void);
int (*phys_pkg_id)(int cpuid_apic, int index_msb);
- /*
- * When one of the next two hooks returns 1 the apic
- * is switched to this. Essentially they are additional
- * probe functions:
- */
- int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid);
-
unsigned int (*get_apic_id)(unsigned long x);
unsigned long (*set_apic_id)(unsigned int id);
unsigned long apic_id_mask;
@@ -343,11 +324,7 @@ struct apic {
/* wakeup_secondary_cpu */
int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
- int trampoline_phys_low;
- int trampoline_phys_high;
-
bool wait_for_init_deassert;
- void (*smp_callin_clear_local_apic)(void);
void (*inquire_remote_apic)(int apicid);
/* apic ops */
@@ -378,14 +355,6 @@ struct apic {
* won't be applied properly during early boot in this case.
*/
int (*x86_32_early_logical_apicid)(int cpu);
-
- /*
- * Optional method called from setup_local_APIC() after logical
- * apicid is guaranteed to be known to initialize apicid -> node
- * mapping if NUMA initialization hasn't done so already. Don't
- * add new users.
- */
- int (*x86_32_numa_cpu_node)(int cpu);
#endif
};
@@ -496,14 +465,12 @@ static inline unsigned default_get_apic_id(unsigned long x)
}
/*
- * Warm reset vector default position:
+ * Warm reset vector position:
*/
-#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467
-#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
+#define TRAMPOLINE_PHYS_LOW 0x467
+#define TRAMPOLINE_PHYS_HIGH 0x469
#ifdef CONFIG_X86_64
-extern int default_acpi_madt_oem_check(char *, char *);
-
extern void apic_send_IPI_self(int vector);
DECLARE_PER_CPU(int, x2apic_extra_bits);
@@ -552,6 +519,8 @@ static inline int default_apic_id_valid(int apicid)
return (apicid < 255);
}
+extern int default_acpi_madt_oem_check(char *, char *);
+
extern void default_setup_apic_routing(void);
extern struct apic apic_noop;
@@ -635,11 +604,6 @@ static inline unsigned long default_check_apicid_used(physid_mask_t *map, int ap
return physid_isset(apicid, *map);
}
-static inline unsigned long default_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
*retmap = *phys_map;
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 5c7198cca5ed..0f4460b5636d 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -99,7 +99,7 @@
#if defined(CONFIG_X86_PPRO_FENCE)
/*
- * For either of these options x86 doesn't have a strong TSO memory
+ * For this option x86 doesn't have a strong TSO memory
* model and we should fall back to full barriers.
*/
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index afcd35d331de..cfe3b954d5e4 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -497,8 +497,6 @@ static __always_inline int fls64(__u64 x)
#include <asm-generic/bitops/sched.h>
-#define ARCH_HAS_FAST_MULTIPLIER 1
-
#include <asm/arch_hweight.h>
#include <asm-generic/bitops/const_hweight.h>
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d47786acb016..99c105d78b7e 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -4,6 +4,8 @@
#include <linux/compiler.h>
#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+#define __HAVE_ARCH_CMPXCHG 1
+
/*
* Non-existant functions to indicate usage errors at link time
* (or compile-time if the compiler implements __compiletime_error().
@@ -143,7 +145,6 @@ extern void __add_wrong_size(void)
# include <asm/cmpxchg_64.h>
#endif
-#ifdef __HAVE_ARCH_CMPXCHG
#define cmpxchg(ptr, old, new) \
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
@@ -152,7 +153,6 @@ extern void __add_wrong_size(void)
#define cmpxchg_local(ptr, old, new) \
__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
-#endif
/*
* xadd() adds "inc" to "*ptr" and atomically returns the previous
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index f8bf2eecab86..f7e142926481 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -34,8 +34,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
: "memory");
}
-#define __HAVE_ARCH_CMPXCHG 1
-
#ifdef CONFIG_X86_CMPXCHG64
#define cmpxchg64(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 614be87f1a9b..1af94697aae5 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -6,8 +6,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
*ptr = val;
}
-#define __HAVE_ARCH_CMPXCHG 1
-
#define cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e265ff95d16d..bb9b258d60e7 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -8,7 +8,7 @@
#include <asm/required-features.h>
#endif
-#define NCAPINTS 10 /* N 32-bit words worth of info */
+#define NCAPINTS 11 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -18,213 +18,218 @@
*/
/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
-#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */
+#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
/* (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */
-#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
-#define X86_FEATURE_XMM (0*32+25) /* "sse" */
-#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
-#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
+#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */
-#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
+#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
+#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
/* Other features, Linux-defined mapping, word 3 */
/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
+#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
/* cpu types for specific tunings: */
-#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */
-#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
-#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
-#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
-#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
-#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
-#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
-#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
-#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_ALWAYS (3*32+21) /* "" Always-present feature */
-#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
-#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
-#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */
-#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
-#define X86_FEATURE_EAGER_FPU (3*32+29) /* "eagerfpu" Non lazy FPU restore */
-#define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
+/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
+/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
+#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
+/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
+#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */
-#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
-#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
-#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */
-#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID (4*32+10) /* Context ID */
-#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */
-#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */
-#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
-#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
-#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
-#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */
-#define X86_FEATURE_AES (4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
-#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
-#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */
-#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
+#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
+#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
+#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
+#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
+#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
+#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
+#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
-#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */
-#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
-#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */
+#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
-#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */
-#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
-#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */
-#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TCE (6*32+17) /* translation cache extension */
-#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
-#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
-#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
-#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
-#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */
-#define X86_FEATURE_PERFCTR_L2 (6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
/*
* Auxiliary flags: Linux defined - For features scattered in various
* CPUID levels like 0x6, 0xA etc, word 7
*/
-#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
-#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
-#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
-#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
-#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */
-#define X86_FEATURE_PROC_FEEDBACK (7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_IDA ( 7*32+ 0) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
+#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
/* Virtualization flags: Linux defined, word 8 */
-#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
-#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */
-#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
-#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
-#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_NPT ( 8*32+ 5) /* AMD Nested Page Table support */
+#define X86_FEATURE_LBRV ( 8*32+ 6) /* AMD LBR Virtualization support */
+#define X86_FEATURE_SVML ( 8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
+#define X86_FEATURE_NRIPS ( 8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR ( 8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN ( 8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID ( 8*32+11) /* AMD flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
-#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
-#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */
-#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
-#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
-#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
+#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
+#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
+
+/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
+#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
+#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
/*
* BUG word(s)
@@ -234,8 +239,11 @@
#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* AMD Erratum 383 */
-#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* AMD Erratum 400 */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -245,6 +253,12 @@
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
+/*
+ * In order to save room, we index into this array by doing
+ * X86_BUG_<name> - NCAPINTS*32.
+ */
+extern const char * const x86_bug_flags[NBUGINTS*32];
+
#define test_cpu_cap(c, bit) \
test_bit(bit, (unsigned long *)((c)->x86_capability))
@@ -301,7 +315,6 @@ extern const char * const x86_power_flags[32];
#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX)
#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
-#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
@@ -328,6 +341,7 @@ extern const char * const x86_power_flags[32];
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT)
+#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
@@ -347,9 +361,6 @@ extern const char * const x86_power_flags[32];
#undef cpu_has_pae
#define cpu_has_pae ___BUG___
-#undef cpu_has_mp
-#define cpu_has_mp 1
-
#undef cpu_has_k6_mtrr
#define cpu_has_k6_mtrr 0
@@ -539,20 +550,20 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
#define static_cpu_has_safe(bit) boot_cpu_has(bit)
#endif
-#define cpu_has_bug(c, bit) cpu_has(c, (bit))
-#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
-#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit));
+#define cpu_has_bug(c, bit) cpu_has(c, (bit))
+#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
+#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit))
-#define static_cpu_has_bug(bit) static_cpu_has((bit))
-#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
+#define static_cpu_has_bug(bit) static_cpu_has((bit))
+#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit))
+#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
-#define MAX_CPU_FEATURES (NCAPINTS * 32)
-#define cpu_have_feature boot_cpu_has
+#define MAX_CPU_FEATURES (NCAPINTS * 32)
+#define cpu_have_feature boot_cpu_has
-#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
-#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
- boot_cpu_data.x86_model
+#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
+#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
+ boot_cpu_data.x86_model
#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
-
#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
new file mode 100644
index 000000000000..f498411f2500
--- /dev/null
+++ b/arch/x86/include/asm/crash.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_X86_CRASH_H
+#define _ASM_X86_CRASH_H
+
+int crash_load_segments(struct kimage *image);
+int crash_copy_backup_region(struct kimage *image);
+int crash_setup_memmap_entries(struct kimage *image,
+ struct boot_params *params);
+
+#endif /* _ASM_X86_CRASH_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 1eb5f6433ad8..0ec241ede5a2 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -104,6 +104,8 @@ extern void __init runtime_code_page_mkexec(void);
extern void __init efi_runtime_mkexec(void);
extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
+extern int __init efi_reuse_config(u64 tables, int nr_tables);
+extern void efi_delete_dummy_variable(void);
struct efi_setup_data {
u64 fw_vendor;
@@ -156,6 +158,9 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
return EFI_SUCCESS;
}
#endif /* CONFIG_EFI_MIXED */
+
+extern bool efi_reboot_required(void);
+
#else
/*
* IF EFI is not configured, have the EFI calls return -ENOSYS.
@@ -168,6 +173,10 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
+static inline bool efi_reboot_required(void)
+{
+ return false;
+}
#endif /* CONFIG_EFI */
#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b0910f97a3ea..ffb1733ac91f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -106,14 +106,14 @@ enum fixed_addresses {
__end_of_permanent_fixed_addresses,
/*
- * 256 temporary boot-time mappings, used by early_ioremap(),
+ * 512 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
- * If necessary we round it up to the next 256 pages boundary so
+ * If necessary we round it up to the next 512 pages boundary so
* that we can have a single pgd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
-#define FIX_BTMAPS_SLOTS 4
+#define FIX_BTMAPS_SLOTS 8
#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
FIX_BTMAP_END =
(__end_of_permanent_fixed_addresses ^
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 115e3689cd53..412ececa00b9 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -293,7 +293,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. "m" is a random variable that should be in L1 */
- if (unlikely(static_cpu_has_safe(X86_FEATURE_FXSAVE_LEAK))) {
+ if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
asm volatile(
"fnclex\n\t"
"emms\n\t"
@@ -508,9 +508,12 @@ static inline void user_fpu_begin(void)
static inline void __save_fpu(struct task_struct *tsk)
{
- if (use_xsave())
- xsave_state(&tsk->thread.fpu.state->xsave, -1);
- else
+ if (use_xsave()) {
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ xsave_state_booting(&tsk->thread.fpu.state->xsave, -1);
+ else
+ xsave_state(&tsk->thread.fpu.state->xsave, -1);
+ } else
fpu_fxsave(&tsk->thread.fpu);
}
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 0525a8bdf65d..e1f7fecaa7d6 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -68,6 +68,8 @@ struct dyn_arch_ftrace {
int ftrace_int3_handler(struct pt_regs *regs);
+#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
+
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 230853da4ec0..0f5fb6b6567e 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -40,9 +40,6 @@ typedef struct {
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
-#define MAX_HARDIRQS_PER_CPU NR_VECTORS
-
#define __ARCH_IRQ_STAT
#define inc_irq_stat(member) this_cpu_inc(irq_stat.member)
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index a20365953bf8..ccffa53750a8 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -67,4 +67,9 @@ struct legacy_pic {
extern struct legacy_pic *legacy_pic;
extern struct legacy_pic null_legacy_pic;
+static inline int nr_legacy_irqs(void)
+{
+ return legacy_pic->nr_legacy_irqs;
+}
+
#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 90f97b4b9347..1733ab49ac5e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -98,6 +98,8 @@ struct IR_IO_APIC_route_entry {
#define IOAPIC_AUTO -1
#define IOAPIC_EDGE 0
#define IOAPIC_LEVEL 1
+#define IOAPIC_MAP_ALLOC 0x1
+#define IOAPIC_MAP_CHECK 0x2
#ifdef CONFIG_X86_IO_APIC
@@ -118,9 +120,6 @@ extern int mp_irq_entries;
/* MP IRQ source entries */
extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
-/* non-0 if default (table-less) MP configuration */
-extern int mpc_default_type;
-
/* Older SiS APIC requires we rewrite the index register */
extern int sis_apic_bug;
@@ -133,9 +132,6 @@ extern int noioapicquirk;
/* -1 if "noapic" boot option passed */
extern int noioapicreroute;
-/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
-extern int timer_through_8259;
-
/*
* If we use the IO-APIC for IRQ routing, disable automatic
* assignment of PCI IRQ's.
@@ -145,24 +141,17 @@ extern int timer_through_8259;
struct io_apic_irq_attr;
struct irq_cfg;
-extern int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr);
-void setup_IO_APIC_irq_extra(u32 gsi);
extern void ioapic_insert_resources(void);
extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
unsigned int, int,
struct io_apic_irq_attr *);
-extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
- unsigned int, int,
- struct io_apic_irq_attr *);
extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg);
extern void native_compose_msi_msg(struct pci_dev *pdev,
unsigned int irq, unsigned int dest,
struct msi_msg *msg, u8 hpet_id);
extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
-int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
extern int save_ioapic_entries(void);
extern void mask_ioapic_entries(void);
@@ -171,15 +160,40 @@ extern int restore_ioapic_entries(void);
extern void setup_ioapic_ids_from_mpc(void);
extern void setup_ioapic_ids_from_mpc_nocheck(void);
+enum ioapic_domain_type {
+ IOAPIC_DOMAIN_INVALID,
+ IOAPIC_DOMAIN_LEGACY,
+ IOAPIC_DOMAIN_STRICT,
+ IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct device_node;
+struct irq_domain;
+struct irq_domain_ops;
+
+struct ioapic_domain_cfg {
+ enum ioapic_domain_type type;
+ const struct irq_domain_ops *ops;
+ struct device_node *dev;
+};
+
struct mp_ioapic_gsi{
u32 gsi_base;
u32 gsi_end;
};
-extern struct mp_ioapic_gsi mp_gsi_routing[];
extern u32 gsi_top;
-int mp_find_ioapic(u32 gsi);
-int mp_find_ioapic_pin(int ioapic, u32 gsi);
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
+
+extern int mp_find_ioapic(u32 gsi);
+extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
+extern u32 mp_pin_to_gsi(int ioapic, int pin);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags);
+extern void mp_unmap_irq(int irq);
+extern void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg);
+extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq);
+extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq);
+extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node);
extern void __init pre_init_apic_IRQ0(void);
extern void mp_save_irq(struct mpc_intsrc *m);
@@ -213,18 +227,19 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
extern void io_apic_eoi(unsigned int apic, unsigned int vector);
+extern bool mp_should_keep_irq(struct device *dev);
+
#else /* !CONFIG_X86_IO_APIC */
#define io_apic_assign_pci_irqs 0
#define setup_ioapic_ids_from_mpc x86_init_noop
-static const int timer_through_8259 = 0;
static inline void ioapic_insert_resources(void) { }
#define gsi_top (NR_IRQS_LEGACY)
static inline int mp_find_ioapic(u32 gsi) { return 0; }
-
-struct io_apic_irq_attr;
-static inline int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr) { return 0; }
+static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; }
+static inline void mp_unmap_irq(int irq) { }
+static inline bool mp_should_keep_irq(struct device *dev) { return 1; }
static inline int save_ioapic_entries(void)
{
diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h
new file mode 100644
index 000000000000..d1b5d194e31d
--- /dev/null
+++ b/arch/x86/include/asm/kexec-bzimage64.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_KEXEC_BZIMAGE64_H
+#define _ASM_KEXEC_BZIMAGE64_H
+
+extern struct kexec_file_ops kexec_bzImage64_ops;
+
+#endif /* _ASM_KEXE_BZIMAGE64_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 17483a492f18..d2434c1cad05 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -23,6 +23,9 @@
#include <asm/page.h>
#include <asm/ptrace.h>
+#include <asm/bootparam.h>
+
+struct kimage;
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
@@ -61,6 +64,10 @@
# define KEXEC_ARCH KEXEC_ARCH_X86_64
#endif
+/* Memory to backup during crash kdump */
+#define KEXEC_BACKUP_SRC_START (0UL)
+#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
+
/*
* CPU does not save ss and sp on stack if execution is already
* running in kernel mode at the time of NMI occurrence. This code
@@ -160,6 +167,44 @@ struct kimage_arch {
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ /* Details of backup region */
+ unsigned long backup_src_start;
+ unsigned long backup_src_sz;
+
+ /* Physical address of backup segment */
+ unsigned long backup_load_addr;
+
+ /* Core ELF header buffer */
+ void *elf_headers;
+ unsigned long elf_headers_sz;
+ unsigned long elf_load_addr;
+};
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+/*
+ * Number of elements and order of elements in this structure should match
+ * with the ones in arch/x86/purgatory/entry64.S. If you make a change here
+ * make an appropriate change in purgatory too.
+ */
+struct kexec_entry64_regs {
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rsp;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ uint64_t rip;
};
#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a04fe4eb237d..eb181178fe0b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -37,6 +37,7 @@ struct x86_instruction_info {
u8 modrm_reg; /* index of register used */
u8 modrm_rm; /* rm part of modrm */
u64 src_val; /* value of source operand */
+ u64 dst_val; /* value of destination operand */
u8 src_bytes; /* size of source operand */
u8 dst_bytes; /* size of destination operand */
u8 ad_bytes; /* size of src/dst address */
@@ -194,6 +195,7 @@ struct x86_emulate_ops {
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
void (*halt)(struct x86_emulate_ctxt *ctxt);
void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
@@ -231,7 +233,7 @@ struct operand {
union {
unsigned long val;
u64 val64;
- char valptr[sizeof(unsigned long) + 2];
+ char valptr[sizeof(sse128_t)];
sse128_t vec_val;
u64 mm_val;
void *data;
@@ -240,8 +242,8 @@ struct operand {
struct fetch_cache {
u8 data[15];
- unsigned long start;
- unsigned long end;
+ u8 *ptr;
+ u8 *end;
};
struct read_cache {
@@ -286,30 +288,36 @@ struct x86_emulate_ctxt {
u8 opcode_len;
u8 b;
u8 intercept;
- u8 lock_prefix;
- u8 rep_prefix;
u8 op_bytes;
u8 ad_bytes;
- u8 rex_prefix;
struct operand src;
struct operand src2;
struct operand dst;
- bool has_seg_override;
- u8 seg_override;
- u64 d;
int (*execute)(struct x86_emulate_ctxt *ctxt);
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+ /*
+ * The following six fields are cleared together,
+ * the rest are initialized unconditionally in x86_decode_insn
+ * or elsewhere
+ */
+ bool rip_relative;
+ u8 rex_prefix;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ /* bitmaps of registers in _regs[] that can be read */
+ u32 regs_valid;
+ /* bitmaps of registers in _regs[] that have been written */
+ u32 regs_dirty;
/* modrm */
u8 modrm;
u8 modrm_mod;
u8 modrm_reg;
u8 modrm_rm;
u8 modrm_seg;
- bool rip_relative;
+ u8 seg_override;
+ u64 d;
unsigned long _eip;
struct operand memop;
- u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */
- u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */
/* Fields above regs are cleared together. */
unsigned long _regs[NR_VCPU_REGS];
struct operand *memopp;
@@ -407,6 +415,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_OK 0
#define EMULATION_RESTART 1
#define EMULATION_INTERCEPTED 2
+void init_decode_cache(struct x86_emulate_ctxt *ctxt);
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, int idt_index, int reason,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 49205d01b9ad..7c492ed9087b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -95,7 +95,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 80
#define KVM_NR_FIXED_MTRR_REGION 88
-#define KVM_NR_VAR_MTRR 10
+#define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
@@ -152,14 +152,16 @@ enum {
#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
-#define DR6_FIXED_1 0xffff0ff0
-#define DR6_VOLATILE 0x0000e00f
+#define DR6_RTM (1 << 16)
+#define DR6_FIXED_1 0xfffe0ff0
+#define DR6_INIT 0xffff0ff0
+#define DR6_VOLATILE 0x0001e00f
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
#define DR7_GD (1 << 13)
#define DR7_FIXED_1 0x00000400
-#define DR7_VOLATILE 0xffff23ff
+#define DR7_VOLATILE 0xffff2bff
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
@@ -448,7 +450,7 @@ struct kvm_vcpu_arch {
u64 tsc_offset_adjustment;
u64 this_tsc_nsec;
u64 this_tsc_write;
- u8 this_tsc_generation;
+ u64 this_tsc_generation;
bool tsc_catchup;
bool tsc_always_catchup;
s8 virtual_tsc_shift;
@@ -591,7 +593,7 @@ struct kvm_arch {
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
- u8 cur_tsc_generation;
+ u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
spinlock_t pvclock_gtod_sync_lock;
@@ -717,7 +719,7 @@ struct kvm_x86_ops {
int (*handle_exit)(struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
- u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
void (*set_irq)(struct kvm_vcpu *vcpu);
@@ -1070,6 +1072,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc);
int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index a55c7efcc4ed..0f555cc31984 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -13,7 +13,7 @@
#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
#endif
-#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+#if defined(CONFIG_X86_32)
/*
* This lock provides nmi access to the CMOS/RTC registers. It has some
* special properties. It is owned by a CPU and stores the index register
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index be12c534fd59..166af2a8e865 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -3,6 +3,10 @@
#include <asm/desc.h>
#include <linux/atomic.h>
+#include <linux/mm_types.h>
+
+#include <trace/events/tlb.h>
+
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
@@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Re-load page tables */
load_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
/* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* to make sure to use no freed page tables.
*/
load_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_LDT_nolock(&next->context);
}
}
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index f5a617956735..b07233b64578 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -40,8 +40,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
extern unsigned int boot_cpu_physical_apicid;
-extern unsigned int max_physical_apicid;
-extern int mpc_default_type;
extern unsigned long mp_lapic_addr;
#ifdef CONFIG_X86_LOCAL_APIC
@@ -88,15 +86,6 @@ static inline void early_reserve_e820_mpc_new(void) { }
#endif
int generic_processor_info(int apicid, int version);
-#ifdef CONFIG_ACPI
-extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
-extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
- u32 gsi);
-extern void mp_config_acpi_legacy_irqs(void);
-struct device;
-extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
- int active_high_low);
-#endif /* CONFIG_ACPI */
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
@@ -161,8 +150,4 @@ static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
extern physid_mask_t phys_cpu_present_map;
-extern int generic_mps_oem_check(struct mpc_table *, char *, char *);
-
-extern int default_acpi_madt_oem_check(char *, char *);
-
#endif /* _ASM_X86_MPSPEC_H */
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
index 0208c3c2cbc6..85e6cda45a02 100644
--- a/arch/x86/include/asm/mutex_32.h
+++ b/arch/x86/include/asm/mutex_32.h
@@ -100,23 +100,11 @@ do { \
static inline int __mutex_fastpath_trylock(atomic_t *count,
int (*fail_fn)(atomic_t *))
{
- /*
- * We have two variants here. The cmpxchg based one is the best one
- * because it never induce a false contention state. It is included
- * here because architectures using the inc/dec algorithms over the
- * xchg ones are much more likely to support cmpxchg natively.
- *
- * If not we fall back to the spinlock based variant - that is
- * just as efficient (and simpler) as a 'destructive' probing of
- * the mutex state would be.
- */
-#ifdef __HAVE_ARCH_CMPXCHG
+ /* cmpxchg because it never induces a false contention state. */
if (likely(atomic_cmpxchg(count, 1, 0) == 1))
return 1;
+
return 0;
-#else
- return fail_fn(count);
-#endif
}
#endif /* _ASM_X86_MUTEX_32_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5f96f9..a1410db38a1a 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -43,7 +43,7 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (!current_set_polling_and_test()) {
- if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+ if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
mb();
clflush((void *)&current_thread_info()->flags);
mb();
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 775873d3be55..802dde30c928 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -70,7 +70,6 @@ extern bool __virt_addr_valid(unsigned long kaddr);
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
-#define __HAVE_ARCH_GATE_AREA 1
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 0f1ddee6a0ce..f408caf73430 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -39,4 +39,6 @@ void copy_page(void *to, void *from);
#endif /* !__ASSEMBLY__ */
+#define __HAVE_ARCH_GATE_AREA 1
+
#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 851bcdc5db04..fd472181a1d0 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -52,10 +52,9 @@
* Compared to the generic __my_cpu_offset version, the following
* saves one instruction and avoids clobbering a temp register.
*/
-#define raw_cpu_ptr(ptr) \
+#define arch_raw_cpu_ptr(ptr) \
({ \
unsigned long tcp_ptr__; \
- __verify_pcpu_ptr(ptr); \
asm volatile("add " __percpu_arg(1) ", %0" \
: "=r" (tcp_ptr__) \
: "m" (this_cpu_off), "0" (ptr)); \
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0ec056012618..aa97a070f09f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,8 +131,13 @@ static inline int pte_exec(pte_t pte)
static inline int pte_special(pte_t pte)
{
- return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
- (_PAGE_PRESENT|_PAGE_SPECIAL);
+ /*
+ * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h.
+ * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 ==
+ * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
+ */
+ return (pte_flags(pte) & _PAGE_SPECIAL) &&
+ (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
}
static inline unsigned long pte_pfn(pte_t pte)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 5be9063545d2..3874693c0e53 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -19,6 +19,7 @@ extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
+extern pte_t level1_fixmap_pgt[512];
extern pgd_t init_level4_pgt[];
#define swapper_pg_dir init_level4_pgt
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
new file mode 100644
index 000000000000..0a4e140315b6
--- /dev/null
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -0,0 +1,78 @@
+/*
+ * platform_sst_audio.h: sst audio platform data header file
+ *
+ * Copyright (C) 2012-14 Intel Corporation
+ * Author: Jeeja KP <jeeja.kp@intel.com>
+ * Omair Mohammed Abdullah <omair.m.abdullah@intel.com>
+ * Vinod Koul ,vinod.koul@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _PLATFORM_SST_AUDIO_H_
+#define _PLATFORM_SST_AUDIO_H_
+
+#include <linux/sfi.h>
+
+enum sst_audio_task_id_mrfld {
+ SST_TASK_ID_NONE = 0,
+ SST_TASK_ID_SBA = 1,
+ SST_TASK_ID_MEDIA = 3,
+ SST_TASK_ID_MAX = SST_TASK_ID_MEDIA,
+};
+
+/* Device IDs for Merrifield are Pipe IDs,
+ * ref: DSP spec v0.75 */
+enum sst_audio_device_id_mrfld {
+ /* Output pipeline IDs */
+ PIPE_ID_OUT_START = 0x0,
+ PIPE_CODEC_OUT0 = 0x2,
+ PIPE_CODEC_OUT1 = 0x3,
+ PIPE_SPROT_LOOP_OUT = 0x4,
+ PIPE_MEDIA_LOOP1_OUT = 0x5,
+ PIPE_MEDIA_LOOP2_OUT = 0x6,
+ PIPE_VOIP_OUT = 0xC,
+ PIPE_PCM0_OUT = 0xD,
+ PIPE_PCM1_OUT = 0xE,
+ PIPE_PCM2_OUT = 0xF,
+ PIPE_MEDIA0_OUT = 0x12,
+ PIPE_MEDIA1_OUT = 0x13,
+/* Input Pipeline IDs */
+ PIPE_ID_IN_START = 0x80,
+ PIPE_CODEC_IN0 = 0x82,
+ PIPE_CODEC_IN1 = 0x83,
+ PIPE_SPROT_LOOP_IN = 0x84,
+ PIPE_MEDIA_LOOP1_IN = 0x85,
+ PIPE_MEDIA_LOOP2_IN = 0x86,
+ PIPE_VOIP_IN = 0x8C,
+ PIPE_PCM0_IN = 0x8D,
+ PIPE_PCM1_IN = 0x8E,
+ PIPE_MEDIA0_IN = 0x8F,
+ PIPE_MEDIA1_IN = 0x90,
+ PIPE_MEDIA2_IN = 0x91,
+ PIPE_RSVD = 0xFF,
+};
+
+/* The stream map for each platform consists of an array of the below
+ * stream map structure.
+ */
+struct sst_dev_stream_map {
+ u8 dev_num; /* device id */
+ u8 subdev_num; /* substream */
+ u8 direction;
+ u8 device_id; /* fw id */
+ u8 task_id; /* fw task */
+ u8 status;
+};
+
+struct sst_platform_data {
+ /* Intel software platform id*/
+ struct sst_dev_stream_map *pdev_strm_map;
+ unsigned int strm_map_size;
+};
+
+int add_sst_platform_device(void);
+#endif
+
diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
new file mode 100644
index 000000000000..fc7a17c05d35
--- /dev/null
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -0,0 +1,107 @@
+/*
+ * Intel Atom SOC Power Management Controller Header File
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef PMC_ATOM_H
+#define PMC_ATOM_H
+
+/* ValleyView Power Control Unit PCI Device ID */
+#define PCI_DEVICE_ID_VLV_PMC 0x0F1C
+
+/* PMC Memory mapped IO registers */
+#define PMC_BASE_ADDR_OFFSET 0x44
+#define PMC_BASE_ADDR_MASK 0xFFFFFE00
+#define PMC_MMIO_REG_LEN 0x100
+#define PMC_REG_BIT_WIDTH 32
+
+/* BIOS uses FUNC_DIS to disable specific function */
+#define PMC_FUNC_DIS 0x34
+#define PMC_FUNC_DIS_2 0x38
+
+/* S0ix wake event control */
+#define PMC_S0IX_WAKE_EN 0x3C
+
+#define BIT_LPC_CLOCK_RUN BIT(4)
+#define BIT_SHARED_IRQ_GPSC BIT(5)
+#define BIT_ORED_DEDICATED_IRQ_GPSS BIT(18)
+#define BIT_ORED_DEDICATED_IRQ_GPSC BIT(19)
+#define BIT_SHARED_IRQ_GPSS BIT(20)
+
+#define PMC_WAKE_EN_SETTING ~(BIT_LPC_CLOCK_RUN | \
+ BIT_SHARED_IRQ_GPSC | \
+ BIT_ORED_DEDICATED_IRQ_GPSS | \
+ BIT_ORED_DEDICATED_IRQ_GPSC | \
+ BIT_SHARED_IRQ_GPSS)
+
+/* The timers acumulate time spent in sleep state */
+#define PMC_S0IR_TMR 0x80
+#define PMC_S0I1_TMR 0x84
+#define PMC_S0I2_TMR 0x88
+#define PMC_S0I3_TMR 0x8C
+#define PMC_S0_TMR 0x90
+/* Sleep state counter is in units of of 32us */
+#define PMC_TMR_SHIFT 5
+
+/* These registers reflect D3 status of functions */
+#define PMC_D3_STS_0 0xA0
+
+#define BIT_LPSS1_F0_DMA BIT(0)
+#define BIT_LPSS1_F1_PWM1 BIT(1)
+#define BIT_LPSS1_F2_PWM2 BIT(2)
+#define BIT_LPSS1_F3_HSUART1 BIT(3)
+#define BIT_LPSS1_F4_HSUART2 BIT(4)
+#define BIT_LPSS1_F5_SPI BIT(5)
+#define BIT_LPSS1_F6_XXX BIT(6)
+#define BIT_LPSS1_F7_XXX BIT(7)
+#define BIT_SCC_EMMC BIT(8)
+#define BIT_SCC_SDIO BIT(9)
+#define BIT_SCC_SDCARD BIT(10)
+#define BIT_SCC_MIPI BIT(11)
+#define BIT_HDA BIT(12)
+#define BIT_LPE BIT(13)
+#define BIT_OTG BIT(14)
+#define BIT_USH BIT(15)
+#define BIT_GBE BIT(16)
+#define BIT_SATA BIT(17)
+#define BIT_USB_EHCI BIT(18)
+#define BIT_SEC BIT(19)
+#define BIT_PCIE_PORT0 BIT(20)
+#define BIT_PCIE_PORT1 BIT(21)
+#define BIT_PCIE_PORT2 BIT(22)
+#define BIT_PCIE_PORT3 BIT(23)
+#define BIT_LPSS2_F0_DMA BIT(24)
+#define BIT_LPSS2_F1_I2C1 BIT(25)
+#define BIT_LPSS2_F2_I2C2 BIT(26)
+#define BIT_LPSS2_F3_I2C3 BIT(27)
+#define BIT_LPSS2_F4_I2C4 BIT(28)
+#define BIT_LPSS2_F5_I2C5 BIT(29)
+#define BIT_LPSS2_F6_I2C6 BIT(30)
+#define BIT_LPSS2_F7_I2C7 BIT(31)
+
+#define PMC_D3_STS_1 0xA4
+#define BIT_SMB BIT(0)
+#define BIT_OTG_SS_PHY BIT(1)
+#define BIT_USH_SS_PHY BIT(2)
+#define BIT_DFX BIT(3)
+
+/* PMC I/O Registers */
+#define ACPI_BASE_ADDR_OFFSET 0x40
+#define ACPI_BASE_ADDR_MASK 0xFFFFFE00
+#define ACPI_MMIO_REG_LEN 0x100
+
+#define PM1_CNT 0x4
+#define SLEEP_TYPE_MASK 0xFFFFECFF
+#define SLEEP_TYPE_S5 0x1C00
+#define SLEEP_ENABLE 0x2000
+#endif /* PMC_ATOM_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a4ea02351f4d..eb71ec794732 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO];
extern u16 __read_mostly tlb_lld_2m[NR_INFO];
extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];
-extern s8 __read_mostly tlb_flushall_shift;
/*
* CPU type and hardware bug flags. Kept separately for each CPU.
@@ -386,8 +385,8 @@ struct bndcsr_struct {
struct xsave_hdr_struct {
u64 xstate_bv;
- u64 reserved1[2];
- u64 reserved2[5];
+ u64 xcomp_bv;
+ u64 reserved[6];
} __attribute__((packed));
struct xsave_struct {
@@ -696,6 +695,8 @@ static inline void cpu_relax(void)
rep_nop();
}
+#define cpu_relax_lowlatency() cpu_relax()
+
/* Stop speculative execution and prefetching of modified code. */
static inline void sync_core(void)
{
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index fbeb06ed0eaa..1d081ac1cd69 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -26,12 +26,10 @@
extern int of_ioapic;
extern u64 initial_dtb;
extern void add_dtb(u64 data);
-extern void x86_add_irq_domains(void);
void x86_of_pci_init(void);
void x86_dtb_init(void);
#else
static inline void add_dtb(u64 data) { }
-static inline void x86_add_irq_domains(void) { }
static inline void x86_of_pci_init(void) { }
static inline void x86_dtb_init(void) { }
#define of_ioapic 0
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
index 70f46f07f94e..ae0e241e228b 100644
--- a/arch/x86/include/asm/qrwlock.h
+++ b/arch/x86/include/asm/qrwlock.h
@@ -3,7 +3,7 @@
#include <asm-generic/qrwlock_types.h>
-#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
+#ifndef CONFIG_X86_PPRO_FENCE
#define queue_write_unlock queue_write_unlock
static inline void queue_write_unlock(struct qrwlock *lock)
{
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
deleted file mode 100644
index 4240878b9d76..000000000000
--- a/arch/x86/include/asm/scatterlist.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _ASM_X86_SCATTERLIST_H
-#define _ASM_X86_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 49adfd7bb4a4..0da7409f0bec 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -17,11 +17,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
spin_unlock_irqrestore(&rtc_lock, flags);
local_flush_tlb();
pr_debug("1.\n");
- *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
- start_eip >> 4;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+ start_eip >> 4;
pr_debug("2.\n");
- *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) =
- start_eip & 0xf;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+ start_eip & 0xf;
pr_debug("3.\n");
}
@@ -42,7 +42,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
CMOS_WRITE(0, 0xf);
spin_unlock_irqrestore(&rtc_lock, flags);
- *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 0b46ef261c77..2d60a7813dfe 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -73,6 +73,7 @@
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
+/* assuming UV3 is the same */
#define BAU_MISC_CONTROL_MULT_MASK 3
@@ -93,6 +94,8 @@
#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
+#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT
+#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
#define write_gmmr uv_write_global_mmr64
#define write_lmmr uv_write_local_mmr
#define read_lmmr uv_read_local_mmr
@@ -322,8 +325,9 @@ struct uv1_bau_msg_header {
/*
* UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
* see figure 9-2 of harp_sys.pdf
+ * assuming UV3 is the same
*/
-struct uv2_bau_msg_header {
+struct uv2_3_bau_msg_header {
unsigned int base_dest_nasid:15; /* nasid of the first bit */
/* bits 14:0 */ /* in uvhub map */
unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */
@@ -395,7 +399,7 @@ struct bau_desc {
*/
union bau_msg_header {
struct uv1_bau_msg_header uv1_hdr;
- struct uv2_bau_msg_header uv2_hdr;
+ struct uv2_3_bau_msg_header uv2_3_hdr;
} header;
struct bau_msg_payload payload;
@@ -631,11 +635,6 @@ struct bau_control {
struct hub_and_pnode *thp;
};
-static inline unsigned long read_mmr_uv2_status(void)
-{
- return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
-}
-
static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
{
write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
@@ -760,7 +759,11 @@ static inline int atomic_read_short(const struct atomic_short *v)
*/
static inline int atom_asr(short i, struct atomic_short *v)
{
- return i + xadd(&v->counter, i);
+ short __i = i;
+ asm volatile(LOCK_PREFIX "xaddw %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
}
/*
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 30be253dd283..8021bd28c0f1 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -18,15 +18,15 @@ struct vdso_image {
unsigned long alt, alt_len;
- unsigned long sym_end_mapping; /* Total size of the mapping */
-
- unsigned long sym_vvar_page;
- unsigned long sym_hpet_page;
- unsigned long sym_VDSO32_NOTE_MASK;
- unsigned long sym___kernel_sigreturn;
- unsigned long sym___kernel_rt_sigreturn;
- unsigned long sym___kernel_vsyscall;
- unsigned long sym_VDSO32_SYSENTER_RETURN;
+ long sym_vvar_start; /* Negative offset to the vvar area */
+
+ long sym_vvar_page;
+ long sym_hpet_page;
+ long sym_VDSO32_NOTE_MASK;
+ long sym___kernel_sigreturn;
+ long sym___kernel_rt_sigreturn;
+ long sym___kernel_vsyscall;
+ long sym_VDSO32_SYSENTER_RETURN;
};
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index 44282fbf7bf9..c4b9dc2f67c5 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -17,10 +17,4 @@
#define vga_readb(x) (*(x))
#define vga_writeb(x, y) (*(y) = (x))
-#ifdef CONFIG_FB_EFI
-#define __ARCH_HAS_VGA_DEFAULT_DEVICE
-extern struct pci_dev *vga_default_device(void);
-extern void vga_set_default_device(struct pci_dev *pdev);
-#endif
-
#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 7004d21e6219..bcbfade26d8d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -51,6 +51,9 @@
#define CPU_BASED_MONITOR_EXITING 0x20000000
#define CPU_BASED_PAUSE_EXITING 0x40000000
#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172
+
/*
* Definitions of Secondary Processor-Based VM-Execution Controls.
*/
@@ -76,7 +79,7 @@
#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
-#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
@@ -89,7 +92,7 @@
#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
-#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index d949ef28c48b..7e7a79ada658 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -52,24 +52,170 @@ extern void xsave_init(void);
extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
extern int init_fpu(struct task_struct *child);
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+#define xstate_fault ".section .fixup,\"ax\"\n" \
+ "3: movl $-1,%[err]\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [err] "=r" (err)
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
{
- int err;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ asm volatile("1:"XSAVES"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+ else
+ asm volatile("1:"XSAVE"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile(xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
- asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : [err] "=r" (err)
- : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ asm volatile("1:"XRSTORS"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+ else
+ asm volatile("1:"XRSTOR"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile(xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline int xsave_state(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ /*
+ * If xsaves is enabled, xsaves replaces xsaveopt because
+ * it supports compact format and supervisor states in addition to
+ * modified optimization in xsaveopt.
+ *
+ * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
+ * because xsaveopt supports modified optimization which is not
+ * supported by xsave.
+ *
+ * If none of xsaves and xsaveopt is enabled, use xsave.
+ */
+ alternative_input_2(
+ "1:"XSAVE,
+ "1:"XSAVEOPT,
+ X86_FEATURE_XSAVEOPT,
+ "1:"XSAVES,
+ X86_FEATURE_XSAVES,
+ [fx] "D" (fx), "a" (lmask), "d" (hmask) :
+ "memory");
+ asm volatile("2:\n\t"
+ xstate_fault
+ : "0" (0)
: "memory");
return err;
}
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
+{
+ int err = 0;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ /*
+ * Use xrstors to restore context if it is enabled. xrstors supports
+ * compacted format of xsave area which is not supported by xrstor.
+ */
+ alternative_input(
+ "1: " XRSTOR,
+ "1: " XRSTORS,
+ X86_FEATURE_XSAVES,
+ "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile("2:\n"
+ xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * Save xstate context for old process during context switch.
+ */
+static inline void fpu_xsave(struct fpu *fpu)
+{
+ xsave_state(&fpu->state->xsave, -1);
+}
+
+/*
+ * Restore xstate context for new process during context switch.
+ */
+static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+{
+ return xrstor_state(fx, -1);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for
+ * backward compatibility for old applications which don't understand
+ * compacted format of xsave area.
+ */
static inline int xsave_user(struct xsave_struct __user *buf)
{
int err;
@@ -83,69 +229,34 @@ static inline int xsave_user(struct xsave_struct __user *buf)
return -EFAULT;
__asm__ __volatile__(ASM_STAC "\n"
- "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
+ "1:"XSAVE"\n"
"2: " ASM_CLAC "\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b,3b)
- : [err] "=r" (err)
+ xstate_fault
: "D" (buf), "a" (-1), "d" (-1), "0" (0)
: "memory");
return err;
}
+/*
+ * Restore xstate from user space xsave area.
+ */
static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
{
- int err;
+ int err = 0;
struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
u32 lmask = mask;
u32 hmask = mask >> 32;
__asm__ __volatile__(ASM_STAC "\n"
- "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
+ "1:"XRSTOR"\n"
"2: " ASM_CLAC "\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b,3b)
- : [err] "=r" (err)
+ xstate_fault
: "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
: "memory"); /* memory required? */
return err;
}
-static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
-
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
-}
-
-static inline void xsave_state(struct xsave_struct *fx, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate);
+void setup_xstate_comp(void);
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
-}
-
-static inline void fpu_xsave(struct fpu *fpu)
-{
- /* This, however, we can work around by forcing the compiler to select
- an addressing mode that doesn't require extended registers. */
- alternative_input(
- ".byte " REX_PREFIX "0x0f,0xae,0x27",
- ".byte " REX_PREFIX "0x0f,0xae,0x37",
- X86_FEATURE_XSAVEOPT,
- [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
- "memory");
-}
#endif
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index 09409c44f9a5..3dec769cadf7 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -22,6 +22,7 @@ header-y += ipcbuf.h
header-y += ist.h
header-y += kvm.h
header-y += kvm_para.h
+header-y += kvm_perf.h
header-y += ldt.h
header-y += mce.h
header-y += mman.h
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d3a87780c70b..d7dcef58aefa 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -23,7 +23,10 @@
#define GP_VECTOR 13
#define PF_VECTOR 14
#define MF_VECTOR 16
+#define AC_VECTOR 17
#define MC_VECTOR 18
+#define XM_VECTOR 19
+#define VE_VECTOR 20
/* Select x86 specific features in <linux/kvm.h> */
#define __KVM_HAVE_PIT
diff --git a/arch/x86/include/uapi/asm/kvm_perf.h b/arch/x86/include/uapi/asm/kvm_perf.h
new file mode 100644
index 000000000000..3bb964f88aa1
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm_perf.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_KVM_PERF_H
+#define _ASM_X86_KVM_PERF_H
+
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
+
+#define DECODE_STR_LEN 20
+
+#define VCPU_ID "vcpu_id"
+
+#define KVM_ENTRY_TRACE "kvm:kvm_entry"
+#define KVM_EXIT_TRACE "kvm:kvm_exit"
+#define KVM_EXIT_REASON "exit_reason"
+
+#endif /* _ASM_X86_KVM_PERF_H */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index fcf2b3ae1bf0..e21331ce368f 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -149,6 +149,9 @@
#define MSR_CORE_C1_RES 0x00000660
+#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668
+#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669
+
#define MSR_AMD64_MC0_MASK 0xc0010044
#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
@@ -297,6 +300,8 @@
#define MSR_IA32_TSC_ADJUST 0x0000003b
#define MSR_IA32_BNDCFGS 0x00000d90
+#define MSR_IA32_XSS 0x00000da0
+
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
@@ -558,6 +563,7 @@
/* VMX_BASIC bits and bitmasks */
#define VMX_BASIC_VMCS_SIZE_SHIFT 32
+#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
#define VMX_BASIC_64 0x0001000000000000LLU
#define VMX_BASIC_MEM_TYPE_SHIFT 50
#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 047f9ff2e36c..ada2e2d6be3e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
@@ -106,6 +107,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
+obj-$(CONFIG_PMC_ATOM) += pmc_atom.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 163b22581472..3242e591fa82 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,6 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_APEI) += apei.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
new file mode 100644
index 000000000000..c280df6b2aa2
--- /dev/null
+++ b/arch/x86/kernel/acpi/apei.c
@@ -0,0 +1,62 @@
+/*
+ * Arch-specific APEI-related functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <acpi/apei.h>
+
+#include <asm/mce.h>
+#include <asm/tlbflush.h>
+
+int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+ int i;
+ struct acpi_hest_ia_corrected *cmc;
+ struct acpi_hest_ia_error_bank *mc_bank;
+
+ if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+ return 0;
+
+ cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+ if (!cmc->enabled)
+ return 0;
+
+ /*
+ * We expect HEST to provide a list of MC banks that report errors
+ * in firmware first mode. Otherwise, return non-zero value to
+ * indicate that we are done parsing HEST.
+ */
+ if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
+ !cmc->num_hardware_banks)
+ return 1;
+
+ pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
+
+ mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+ for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+ mce_disable_bank(mc_bank->bank_number);
+#endif
+ return 1;
+}
+
+void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
+{
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev, mem_err);
+#endif
+}
+
+void arch_apei_flush_tlb_one(unsigned long addr)
+{
+ __flush_tlb_one(addr);
+}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 86281ffb96d6..b436fc735aa4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,6 +31,7 @@
#include <linux/module.h>
#include <linux/dmi.h>
#include <linux/irq.h>
+#include <linux/irqdomain.h>
#include <linux/slab.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
@@ -43,6 +44,7 @@
#include <asm/io.h>
#include <asm/mpspec.h>
#include <asm/smp.h>
+#include <asm/i8259.h>
#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
@@ -74,10 +76,6 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
-#ifndef __HAVE_ARCH_CMPXCHG
-#warning ACPI uses CMPXCHG, i486 and later hardware
-#endif
-
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
@@ -97,44 +95,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
-static unsigned int gsi_to_irq(unsigned int gsi)
-{
- unsigned int irq = gsi + NR_IRQS_LEGACY;
- unsigned int i;
-
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- if (isa_irq_to_gsi[i] == gsi) {
- return i;
- }
- }
-
- /* Provide an identity mapping of gsi == irq
- * except on truly weird platforms that have
- * non isa irqs in the first 16 gsis.
- */
- if (gsi >= NR_IRQS_LEGACY)
- irq = gsi;
- else
- irq = gsi_top + gsi;
-
- return irq;
-}
-
-static u32 irq_to_gsi(int irq)
-{
- unsigned int gsi;
-
- if (irq < NR_IRQS_LEGACY)
- gsi = isa_irq_to_gsi[irq];
- else if (irq < gsi_top)
- gsi = irq;
- else if (irq < (gsi_top + NR_IRQS_LEGACY))
- gsi = irq - gsi_top;
- else
- gsi = 0xffffffff;
-
- return gsi;
-}
+#define ACPI_INVALID_GSI INT_MIN
/*
* This is just a simple wrapper around early_ioremap(),
@@ -345,11 +306,145 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
#endif /*CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS 0
+
+static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+ u32 gsi)
+{
+ int ioapic;
+ int pin;
+ struct mpc_intsrc mp_irq;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq; /* IRQ */
+ mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
+ mp_irq.dstirq = pin; /* INTIN# */
+
+ mp_save_irq(&mp_irq);
+
+ /*
+ * Reset default identity mapping if gsi is also an legacy IRQ,
+ * otherwise there will be more than one entry with the same GSI
+ * and acpi_isa_irq_to_gsi() may give wrong result.
+ */
+ if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
+ isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI;
+ isa_irq_to_gsi[bus_irq] = gsi;
+}
+
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
+ int ioapic;
+ u8 pin;
+
+ if (!acpi_ioapic)
+ return 0;
+ if (!dev || !dev_is_pci(dev))
+ return 0;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* print the entry should happen on mptable identically */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_save_irq(&mp_irq);
+#endif
+ return 0;
+}
+
+static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
+{
+ int irq, node;
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return gsi;
+
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return gsi;
+
+ trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+ polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+ node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+ if (mp_set_gsi_attr(gsi, trigger, polarity, node)) {
+ pr_warn("Failed to set pin attr for GSI%d\n", gsi);
+ return -1;
+ }
+
+ irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
+ if (irq < 0)
+ return irq;
+
+ if (enable_update_mptable)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+
+ return irq;
+}
+
+static void mp_unregister_gsi(u32 gsi)
+{
+ int irq;
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return;
+
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return;
+
+ irq = mp_map_gsi_to_irq(gsi, 0);
+ if (irq > 0)
+ mp_unmap_irq(irq);
+}
+
+static struct irq_domain_ops acpi_irqdomain_ops = {
+ .map = mp_irqdomain_map,
+ .unmap = mp_irqdomain_unmap,
+};
static int __init
acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
{
struct acpi_madt_io_apic *ioapic = NULL;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &acpi_irqdomain_ops,
+ };
ioapic = (struct acpi_madt_io_apic *)header;
@@ -358,8 +453,12 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
acpi_table_print_madt_entry(header);
- mp_register_ioapic(ioapic->id,
- ioapic->address, ioapic->global_irq_base);
+ /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */
+ if (ioapic->global_irq_base < nr_legacy_irqs())
+ cfg.type = IOAPIC_DOMAIN_LEGACY;
+
+ mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base,
+ &cfg);
return 0;
}
@@ -382,11 +481,6 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
- /*
- * mp_config_acpi_legacy_irqs() already setup IRQs < 16
- * If GSI is < 16, this will update its flags,
- * else it will create a new mp_irqs[] entry.
- */
mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
/*
@@ -508,25 +602,28 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
outb(new >> 8, 0x4d1);
}
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
{
- *irq = gsi_to_irq(gsi);
+ int irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK);
-#ifdef CONFIG_X86_IO_APIC
- if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
- setup_IO_APIC_irq_extra(gsi);
-#endif
+ if (irq >= 0) {
+ *irqp = irq;
+ return 0;
+ }
- return 0;
+ return -1;
}
EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
{
- if (isa_irq >= 16)
- return -1;
- *gsi = irq_to_gsi(isa_irq);
- return 0;
+ if (isa_irq < nr_legacy_irqs() &&
+ isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) {
+ *gsi = isa_irq_to_gsi[isa_irq];
+ return 0;
+ }
+
+ return -1;
}
static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
@@ -546,15 +643,25 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
int trigger, int polarity)
{
+ int irq = gsi;
+
#ifdef CONFIG_X86_IO_APIC
- gsi = mp_register_gsi(dev, gsi, trigger, polarity);
+ irq = mp_register_gsi(dev, gsi, trigger, polarity);
#endif
- return gsi;
+ return irq;
+}
+
+static void acpi_unregister_gsi_ioapic(u32 gsi)
+{
+#ifdef CONFIG_X86_IO_APIC
+ mp_unregister_gsi(gsi);
+#endif
}
int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity) = acpi_register_gsi_pic;
+void (*__acpi_unregister_gsi)(u32 gsi) = NULL;
#ifdef CONFIG_ACPI_SLEEP
int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
@@ -568,32 +675,22 @@ int (*acpi_suspend_lowlevel)(void);
*/
int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
{
- unsigned int irq;
- unsigned int plat_gsi = gsi;
-
- plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
- irq = gsi_to_irq(plat_gsi);
-
- return irq;
+ return __acpi_register_gsi(dev, gsi, trigger, polarity);
}
EXPORT_SYMBOL_GPL(acpi_register_gsi);
void acpi_unregister_gsi(u32 gsi)
{
+ if (__acpi_unregister_gsi)
+ __acpi_unregister_gsi(gsi);
}
EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
-void __init acpi_set_irq_model_pic(void)
-{
- acpi_irq_model = ACPI_IRQ_MODEL_PIC;
- __acpi_register_gsi = acpi_register_gsi_pic;
- acpi_ioapic = 0;
-}
-
-void __init acpi_set_irq_model_ioapic(void)
+static void __init acpi_set_irq_model_ioapic(void)
{
acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
__acpi_register_gsi = acpi_register_gsi_ioapic;
+ __acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
acpi_ioapic = 1;
}
@@ -829,9 +926,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
* and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing LAPIC address override entry\n");
@@ -856,9 +952,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
* and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing LAPIC address override entry\n");
@@ -886,11 +981,10 @@ static int __init acpi_parse_madt_lapic_entries(void)
return count;
}
- x2count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
- acpi_parse_x2apic_nmi, 0);
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+ acpi_parse_x2apic_nmi, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
+ acpi_parse_lapic_nmi, 0);
if (count < 0 || x2count < 0) {
printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
/* TBD: Cleanup to allow fallback to MPS */
@@ -901,44 +995,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
-#define MP_ISA_BUS 0
-
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- int ioapic;
- int pin;
- struct mpc_intsrc mp_irq;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = mp_find_ioapic_pin(ioapic, gsi);
-
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
-
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger << 2) | polarity;
- mp_irq.srcbus = MP_ISA_BUS;
- mp_irq.srcbusirq = bus_irq; /* IRQ */
- mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
- mp_irq.dstirq = pin; /* INTIN# */
-
- mp_save_irq(&mp_irq);
-
- isa_irq_to_gsi[bus_irq] = gsi;
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
+static void __init mp_config_acpi_legacy_irqs(void)
{
int i;
struct mpc_intsrc mp_irq;
@@ -956,7 +1013,7 @@ void __init mp_config_acpi_legacy_irqs(void)
* Use the default configuration for the IRQs 0-15. Unless
* overridden by (MADT) interrupt source override entries.
*/
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < nr_legacy_irqs(); i++) {
int ioapic, pin;
unsigned int dstapic;
int idx;
@@ -1004,84 +1061,6 @@ void __init mp_config_acpi_legacy_irqs(void)
}
}
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
- int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
- struct mpc_intsrc mp_irq;
- struct pci_dev *pdev;
- unsigned char number;
- unsigned int devfn;
- int ioapic;
- u8 pin;
-
- if (!acpi_ioapic)
- return 0;
- if (!dev || !dev_is_pci(dev))
- return 0;
-
- pdev = to_pci_dev(dev);
- number = pdev->bus->number;
- devfn = pdev->devfn;
- pin = pdev->pin;
- /* print the entry should happen on mptable identically */
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
- (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.srcbus = number;
- mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
- ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mpc_ioapic_id(ioapic);
- mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
- mp_save_irq(&mp_irq);
-#endif
- return 0;
-}
-
-int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
-{
- int ioapic;
- int ioapic_pin;
- struct io_apic_irq_attr irq_attr;
- int ret;
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
-
- ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
-
- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mpc_ioapic_id(ioapic),
- ioapic_pin);
- return gsi;
- }
-
- if (enable_update_mptable)
- mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
- set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
- trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
- if (ret < 0)
- gsi = INT_MIN;
-
- return gsi;
-}
-
/*
* Parse IOAPIC related entries in MADT
* returns 0 on success, < 0 on error
@@ -1111,9 +1090,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
return -ENODEV;
}
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
- MAX_IO_APICS);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+ MAX_IO_APICS);
if (!count) {
printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
return -ENODEV;
@@ -1122,9 +1100,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
return count;
}
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
- nr_irqs);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+ acpi_parse_int_src_ovr, nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing interrupt source overrides entry\n");
@@ -1143,9 +1120,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
/* Fill in identity legacy mappings where no override */
mp_config_acpi_legacy_irqs();
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
- nr_irqs);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
+ acpi_parse_nmi_src, nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad28db7e6bde..67760275544b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
/*
* The highest APIC ID seen during enumeration.
*/
-unsigned int max_physical_apicid;
+static unsigned int max_physical_apicid;
/*
* Bitmask of physically existing CPUs:
@@ -1342,17 +1342,6 @@ void setup_local_APIC(void)
/* always use the value from LDR */
early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
logical_smp_processor_id();
-
- /*
- * Some NUMA implementations (NUMAQ) don't initialize apicid to
- * node mapping during NUMA init. Now that logical apicid is
- * guaranteed to be known, give it another chance. This is already
- * a bit too late - percpu allocation has already happened without
- * proper NUMA affinity.
- */
- if (apic->x86_32_numa_cpu_node)
- set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
- apic->x86_32_numa_cpu_node(cpu));
#endif
/*
@@ -2053,8 +2042,6 @@ void __init connect_bsp_APIC(void)
imcr_pic_to_apic();
}
#endif
- if (apic->enable_apic_mode)
- apic->enable_apic_mode();
}
/**
@@ -2451,51 +2438,6 @@ static void apic_pm_activate(void) { }
#ifdef CONFIG_X86_64
-static int apic_cluster_num(void)
-{
- int i, clusters, zeros;
- unsigned id;
- u16 *bios_cpu_apicid;
- DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
- bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
- bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
- for (i = 0; i < nr_cpu_ids; i++) {
- /* are we being called early in kernel startup? */
- if (bios_cpu_apicid) {
- id = bios_cpu_apicid[i];
- } else if (i < nr_cpu_ids) {
- if (cpu_present(i))
- id = per_cpu(x86_bios_cpu_apicid, i);
- else
- continue;
- } else
- break;
-
- if (id != BAD_APICID)
- __set_bit(APIC_CLUSTERID(id), clustermap);
- }
-
- /* Problem: Partially populated chassis may not have CPUs in some of
- * the APIC clusters they have been allocated. Only present CPUs have
- * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
- * Since clusters are allocated sequentially, count zeros only if
- * they are bounded by ones.
- */
- clusters = 0;
- zeros = 0;
- for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
- if (test_bit(i, clustermap)) {
- clusters += 1 + zeros;
- zeros = 0;
- } else
- ++zeros;
- }
-
- return clusters;
-}
-
static int multi_checked;
static int multi;
@@ -2540,20 +2482,7 @@ static void dmi_check_multi(void)
int apic_is_clustered_box(void)
{
dmi_check_multi();
- if (multi)
- return 1;
-
- if (!is_vsmp_box())
- return 0;
-
- /*
- * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (apic_cluster_num() > 1)
- return 1;
-
- return 0;
+ return multi;
}
#endif
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 7c1b29479513..de918c410eae 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -168,21 +168,16 @@ static struct apic apic_flat = {
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = flat_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
@@ -196,10 +191,7 @@ static struct apic apic_flat = {
.send_IPI_all = flat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
@@ -283,7 +275,6 @@ static struct apic apic_physflat = {
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
/* not needed, but shouldn't hurt: */
@@ -291,14 +282,10 @@ static struct apic apic_physflat = {
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = flat_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
@@ -312,10 +299,7 @@ static struct apic apic_physflat = {
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 8c7c98249c20..b205cdbdbe6a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -89,16 +89,6 @@ static const struct cpumask *noop_target_cpus(void)
return cpumask_of(0);
}
-static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return physid_isset(apicid, *map);
-}
-
-static unsigned long noop_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
const struct cpumask *mask)
{
@@ -133,27 +123,21 @@ struct apic apic_noop = {
.target_cpus = noop_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = noop_check_apicid_used,
- .check_apicid_present = noop_check_apicid_present,
+ .check_apicid_used = default_check_apicid_used,
.vector_allocation_domain = noop_vector_allocation_domain,
.init_apic_ldr = noop_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = noop_phys_pkg_id,
- .mps_oem_check = NULL,
-
.get_apic_id = noop_get_apic_id,
.set_apic_id = NULL,
.apic_id_mask = 0x0F << 24,
@@ -168,12 +152,7 @@ struct apic apic_noop = {
.wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
- /* should be safe */
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = noop_apic_read,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index a5b45df8bc88..ae915391ebec 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -217,21 +217,16 @@ static const struct apic apic_numachip __refconst = {
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = numachip_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
@@ -246,10 +241,7 @@ static const struct apic apic_numachip __refconst = {
.send_IPI_self = numachip_send_IPI_self,
.wakeup_secondary_cpu = numachip_wakeup_secondary,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index e4840aa7a255..c4a8d63f8220 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -31,11 +31,6 @@ static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
return 0;
}
-static unsigned long bigsmp_check_apicid_present(int bit)
-{
- return 1;
-}
-
static int bigsmp_early_logical_apicid(int cpu)
{
/* on bigsmp, logical apicid is the same as physical */
@@ -168,21 +163,16 @@ static struct apic apic_bigsmp = {
.disable_esr = 1,
.dest_logical = 0,
.check_apicid_used = bigsmp_check_apicid_used,
- .check_apicid_present = bigsmp_check_apicid_present,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = bigsmp_init_apic_ldr,
.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
.setup_apic_routing = bigsmp_setup_apic_routing,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = bigsmp_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = bigsmp_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
@@ -196,11 +186,7 @@ static struct apic apic_bigsmp = {
.send_IPI_all = bigsmp_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
.wait_for_init_deassert = true,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 81e08eff05ee..337ce5a9b15c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -31,6 +31,7 @@
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/syscore_ops.h>
+#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/htirq.h>
#include <linux/freezer.h>
@@ -62,6 +63,16 @@
#define __apicdebuginit(type) static type __init
+#define for_each_ioapic(idx) \
+ for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
+#define for_each_ioapic_reverse(idx) \
+ for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--)
+#define for_each_pin(idx, pin) \
+ for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
+#define for_each_ioapic_pin(idx, pin) \
+ for_each_ioapic((idx)) \
+ for_each_pin((idx), (pin))
+
#define for_each_irq_pin(entry, head) \
for (entry = head; entry; entry = entry->next)
@@ -73,6 +84,17 @@ int sis_apic_bug = -1;
static DEFINE_RAW_SPINLOCK(ioapic_lock);
static DEFINE_RAW_SPINLOCK(vector_lock);
+static DEFINE_MUTEX(ioapic_mutex);
+static unsigned int ioapic_dynirq_base;
+static int ioapic_initialized;
+
+struct mp_pin_info {
+ int trigger;
+ int polarity;
+ int node;
+ int set;
+ u32 count;
+};
static struct ioapic {
/*
@@ -87,7 +109,9 @@ static struct ioapic {
struct mpc_ioapic mp_config;
/* IO APIC gsi routing info */
struct mp_ioapic_gsi gsi_config;
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+ struct ioapic_domain_cfg irqdomain_cfg;
+ struct irq_domain *irqdomain;
+ struct mp_pin_info *pin_info;
} ioapics[MAX_IO_APICS];
#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
@@ -107,6 +131,41 @@ struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
return &ioapics[ioapic_idx].gsi_config;
}
+static inline int mp_ioapic_pin_count(int ioapic)
+{
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+ return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+}
+
+u32 mp_pin_to_gsi(int ioapic, int pin)
+{
+ return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
+}
+
+/*
+ * Initialize all legacy IRQs and all pins on the first IOAPIC
+ * if we have legacy interrupt controller. Kernel boot option "pirq="
+ * may rely on non-legacy pins on the first IOAPIC.
+ */
+static inline int mp_init_irq_at_boot(int ioapic, int irq)
+{
+ if (!nr_legacy_irqs())
+ return 0;
+
+ return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
+}
+
+static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
+{
+ return ioapics[ioapic_idx].pin_info + pin;
+}
+
+static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
+{
+ return ioapics[ioapic].irqdomain;
+}
+
int nr_ioapics;
/* The one past the highest gsi number used */
@@ -118,9 +177,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
-/* GSI interrupts */
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
-
#ifdef CONFIG_EISA
int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
@@ -149,8 +205,7 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
-static int io_apic_setup_irq_pin(unsigned int irq, int node,
- struct io_apic_irq_attr *attr);
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
void mp_save_irq(struct mpc_intsrc *m)
@@ -182,19 +237,15 @@ static struct irq_pin_list *alloc_irq_pin_list(int node)
return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
}
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
-
int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
- int count, node, i;
+ int i, node = cpu_to_node(0);
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs())
io_apic_irqs = ~0UL;
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i) {
ioapics[i].saved_registers =
kzalloc(sizeof(struct IO_APIC_route_entry) *
ioapics[i].nr_registers, GFP_KERNEL);
@@ -202,28 +253,20 @@ int __init arch_early_irq_init(void)
pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
}
- cfg = irq_cfgx;
- count = ARRAY_SIZE(irq_cfgx);
- node = cpu_to_node(0);
-
- for (i = 0; i < count; i++) {
- irq_set_chip_data(i, &cfg[i]);
- zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
- zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
- /*
- * For legacy IRQ's, start with assigning irq0 to irq15 to
- * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
- */
- if (i < legacy_pic->nr_legacy_irqs) {
- cfg[i].vector = IRQ0_VECTOR + i;
- cpumask_setall(cfg[i].domain);
- }
+ /*
+ * For legacy IRQ's, start with assigning irq0 to irq15 to
+ * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ cfg = alloc_irq_and_cfg_at(i, node);
+ cfg->vector = IRQ0_VECTOR + i;
+ cpumask_setall(cfg->domain);
}
return 0;
}
-static struct irq_cfg *irq_cfg(unsigned int irq)
+static inline struct irq_cfg *irq_cfg(unsigned int irq)
{
return irq_get_chip_data(irq);
}
@@ -265,7 +308,7 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
if (res < 0) {
if (res != -EEXIST)
return NULL;
- cfg = irq_get_chip_data(at);
+ cfg = irq_cfg(at);
if (cfg)
return cfg;
}
@@ -425,6 +468,21 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
return 0;
}
+static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+{
+ struct irq_pin_list **last, *entry;
+
+ last = &cfg->irq_2_pin;
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ if (entry->apic == apic && entry->pin == pin) {
+ *last = entry->next;
+ kfree(entry);
+ return;
+ } else {
+ last = &entry->next;
+ }
+}
+
static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
if (__add_pin_to_irq_node(cfg, node, apic, pin))
@@ -627,9 +685,8 @@ static void clear_IO_APIC (void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
- clear_IO_APIC_pin(apic, pin);
+ for_each_ioapic_pin(apic, pin)
+ clear_IO_APIC_pin(apic, pin);
}
#ifdef CONFIG_X86_32
@@ -678,13 +735,13 @@ int save_ioapic_entries(void)
int apic, pin;
int err = 0;
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for_each_ioapic(apic) {
if (!ioapics[apic].saved_registers) {
err = -ENOMEM;
continue;
}
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ for_each_pin(apic, pin)
ioapics[apic].saved_registers[pin] =
ioapic_read_entry(apic, pin);
}
@@ -699,11 +756,11 @@ void mask_ioapic_entries(void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for_each_ioapic(apic) {
if (!ioapics[apic].saved_registers)
continue;
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+ for_each_pin(apic, pin) {
struct IO_APIC_route_entry entry;
entry = ioapics[apic].saved_registers[pin];
@@ -722,11 +779,11 @@ int restore_ioapic_entries(void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for_each_ioapic(apic) {
if (!ioapics[apic].saved_registers)
continue;
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ for_each_pin(apic, pin)
ioapic_write_entry(apic, pin,
ioapics[apic].saved_registers[pin]);
}
@@ -785,7 +842,7 @@ static int __init find_isa_irq_apic(int irq, int type)
if (i < mp_irq_entries) {
int ioapic_idx;
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ for_each_ioapic(ioapic_idx)
if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
return ioapic_idx;
}
@@ -799,7 +856,7 @@ static int __init find_isa_irq_apic(int irq, int type)
*/
static int EISA_ELCR(unsigned int irq)
{
- if (irq < legacy_pic->nr_legacy_irqs) {
+ if (irq < nr_legacy_irqs()) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -939,29 +996,106 @@ static int irq_trigger(int idx)
return trigger;
}
-static int pin_2_irq(int idx, int apic, int pin)
+static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+{
+ int irq = -1;
+ int ioapic = (int)(long)domain->host_data;
+ int type = ioapics[ioapic].irqdomain_cfg.type;
+
+ switch (type) {
+ case IOAPIC_DOMAIN_LEGACY:
+ /*
+ * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
+ * GSIs on some weird platforms.
+ */
+ if (gsi < nr_legacy_irqs())
+ irq = irq_create_mapping(domain, pin);
+ else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+ irq = gsi;
+ break;
+ case IOAPIC_DOMAIN_STRICT:
+ if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+ irq = gsi;
+ break;
+ case IOAPIC_DOMAIN_DYNAMIC:
+ irq = irq_create_mapping(domain, pin);
+ break;
+ default:
+ WARN(1, "ioapic: unknown irqdomain type %d\n", type);
+ break;
+ }
+
+ return irq > 0 ? irq : -1;
+}
+
+static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
+ unsigned int flags)
{
int irq;
- int bus = mp_irqs[idx].srcbus;
- struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ struct mp_pin_info *info = mp_pin_info(ioapic, pin);
+
+ if (!domain)
+ return -1;
+
+ mutex_lock(&ioapic_mutex);
/*
- * Debugging check, we are in big trouble if this message pops up!
+ * Don't use irqdomain to manage ISA IRQs because there may be
+ * multiple IOAPIC pins sharing the same ISA IRQ number and
+ * irqdomain only supports 1:1 mapping between IOAPIC pin and
+ * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
+ * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
+ * available, and some BIOSes may use MP Interrupt Source records
+ * to override IRQ numbers for PIRQs instead of reprogramming
+ * the interrupt routing logic. Thus there may be multiple pins
+ * sharing the same legacy IRQ number when ACPI is disabled.
*/
- if (mp_irqs[idx].dstirq != pin)
- pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
-
- if (test_bit(bus, mp_bus_not_pci)) {
+ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
+ if (flags & IOAPIC_MAP_ALLOC) {
+ if (info->count == 0 &&
+ mp_irqdomain_map(domain, irq, pin) != 0)
+ irq = -1;
+
+ /* special handling for timer IRQ0 */
+ if (irq == 0)
+ info->count++;
+ }
} else {
- u32 gsi = gsi_cfg->gsi_base + pin;
+ irq = irq_find_mapping(domain, pin);
+ if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
+ irq = alloc_irq_from_domain(domain, gsi, pin);
+ }
- if (gsi >= NR_IRQS_LEGACY)
- irq = gsi;
- else
- irq = gsi_top + gsi;
+ if (flags & IOAPIC_MAP_ALLOC) {
+ /* special handling for legacy IRQs */
+ if (irq < nr_legacy_irqs() && info->count == 1 &&
+ mp_irqdomain_map(domain, irq, pin) != 0)
+ irq = -1;
+
+ if (irq > 0)
+ info->count++;
+ else if (info->count == 0)
+ info->set = 0;
}
+ mutex_unlock(&ioapic_mutex);
+
+ return irq > 0 ? irq : -1;
+}
+
+static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
+{
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].dstirq != pin)
+ pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
+
#ifdef CONFIG_X86_32
/*
* PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -972,16 +1106,58 @@ static int pin_2_irq(int idx, int apic, int pin)
apic_printk(APIC_VERBOSE, KERN_DEBUG
"disabling PIRQ%d\n", pin-16);
} else {
- irq = pirq_entries[pin-16];
+ int irq = pirq_entries[pin-16];
apic_printk(APIC_VERBOSE, KERN_DEBUG
"using PIRQ%d -> IRQ %d\n",
pin-16, irq);
+ return irq;
}
}
}
#endif
- return irq;
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+}
+
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+{
+ int ioapic, pin, idx;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
+ return -1;
+
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+}
+
+void mp_unmap_irq(int irq)
+{
+ struct irq_data *data = irq_get_irq_data(irq);
+ struct mp_pin_info *info;
+ int ioapic, pin;
+
+ if (!data || !data->domain)
+ return;
+
+ ioapic = (int)(long)data->domain->host_data;
+ pin = (int)data->hwirq;
+ info = mp_pin_info(ioapic, pin);
+
+ mutex_lock(&ioapic_mutex);
+ if (--info->count == 0) {
+ info->set = 0;
+ if (irq < nr_legacy_irqs() &&
+ ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
+ mp_irqdomain_unmap(data->domain, irq);
+ else
+ irq_dispose_mapping(irq);
+ }
+ mutex_unlock(&ioapic_mutex);
}
/*
@@ -991,7 +1167,7 @@ static int pin_2_irq(int idx, int apic, int pin)
int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
struct io_apic_irq_attr *irq_attr)
{
- int ioapic_idx, i, best_guess = -1;
+ int irq, i, best_ioapic = -1, best_idx = -1;
apic_printk(APIC_DEBUG,
"querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
@@ -1001,44 +1177,56 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
"PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
+
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
+ int ioapic_idx, found = 0;
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ if (bus != lbus || mp_irqs[i].irqtype != mp_INT ||
+ slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f))
+ continue;
+
+ for_each_ioapic(ioapic_idx)
if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
- mp_irqs[i].dstapic == MP_APIC_ALL)
+ mp_irqs[i].dstapic == MP_APIC_ALL) {
+ found = 1;
break;
+ }
+ if (!found)
+ continue;
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
+ /* Skip ISA IRQs */
+ irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0);
+ if (irq > 0 && !IO_APIC_IRQ(irq))
+ continue;
- if (!(ioapic_idx || IO_APIC_IRQ(irq)))
- continue;
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ goto out;
+ }
- if (pin == (mp_irqs[i].srcbusirq & 3)) {
- set_io_apic_irq_attr(irq_attr, ioapic_idx,
- mp_irqs[i].dstirq,
- irq_trigger(i),
- irq_polarity(i));
- return irq;
- }
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0) {
- set_io_apic_irq_attr(irq_attr, ioapic_idx,
- mp_irqs[i].dstirq,
- irq_trigger(i),
- irq_polarity(i));
- best_guess = irq;
- }
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_idx < 0) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
}
}
- return best_guess;
+ if (best_idx < 0)
+ return -1;
+
+out:
+ irq = pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
+ IOAPIC_MAP_ALLOC);
+ if (irq > 0)
+ set_io_apic_irq_attr(irq_attr, best_ioapic,
+ mp_irqs[best_idx].dstirq,
+ irq_trigger(best_idx),
+ irq_polarity(best_idx));
+ return irq;
}
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
@@ -1198,7 +1386,7 @@ void __setup_vector_irq(int cpu)
raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_active_irq(irq) {
- cfg = irq_get_chip_data(irq);
+ cfg = irq_cfg(irq);
if (!cfg)
continue;
@@ -1227,12 +1415,10 @@ static inline int IO_APIC_irq_trigger(int irq)
{
int apic, idx, pin;
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
- return irq_trigger(idx);
- }
+ for_each_ioapic_pin(apic, pin) {
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
+ return irq_trigger(idx);
}
/*
* nonexistent IRQs are edge default
@@ -1330,95 +1516,29 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
}
ioapic_register_intr(irq, cfg, attr->trigger);
- if (irq < legacy_pic->nr_legacy_irqs)
+ if (irq < nr_legacy_irqs())
legacy_pic->mask(irq);
ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
}
-static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin)
-{
- if (idx != -1)
- return false;
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
- mpc_ioapic_id(ioapic_idx), pin);
- return true;
-}
-
-static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
-{
- int idx, node = cpu_to_node(0);
- struct io_apic_irq_attr attr;
- unsigned int pin, irq;
-
- for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
- idx = find_irq_entry(ioapic_idx, pin, mp_INT);
- if (io_apic_pin_not_connected(idx, ioapic_idx, pin))
- continue;
-
- irq = pin_2_irq(idx, ioapic_idx, pin);
-
- if ((ioapic_idx > 0) && (irq > 16))
- continue;
-
- /*
- * Skip the timer IRQ if there's a quirk handler
- * installed and if it returns 1:
- */
- if (apic->multi_timer_check &&
- apic->multi_timer_check(ioapic_idx, irq))
- continue;
-
- set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
- irq_polarity(idx));
-
- io_apic_setup_irq_pin(irq, node, &attr);
- }
-}
-
static void __init setup_IO_APIC_irqs(void)
{
- unsigned int ioapic_idx;
+ unsigned int ioapic, pin;
+ int idx;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
- __io_apic_setup_irqs(ioapic_idx);
-}
-
-/*
- * for the gsit that is not in first ioapic
- * but could not use acpi_register_gsi()
- * like some special sci in IBM x3330
- */
-void setup_IO_APIC_irq_extra(u32 gsi)
-{
- int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0);
- struct io_apic_irq_attr attr;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic_idx = mp_find_ioapic(gsi);
- if (ioapic_idx < 0)
- return;
-
- pin = mp_find_ioapic_pin(ioapic_idx, gsi);
- idx = find_irq_entry(ioapic_idx, pin, mp_INT);
- if (idx == -1)
- return;
-
- irq = pin_2_irq(idx, ioapic_idx, pin);
-
- /* Only handle the non legacy irqs on secondary ioapics */
- if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY)
- return;
-
- set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
- irq_polarity(idx));
-
- io_apic_setup_irq_pin_once(irq, node, &attr);
+ for_each_ioapic_pin(ioapic, pin) {
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ apic_printk(APIC_VERBOSE,
+ KERN_DEBUG " apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic), pin);
+ else
+ pin_2_irq(idx, ioapic, pin,
+ ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ }
}
/*
@@ -1586,7 +1706,7 @@ __apicdebuginit(void) print_IO_APICs(void)
struct irq_chip *chip;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ for_each_ioapic(ioapic_idx)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
mpc_ioapic_id(ioapic_idx),
ioapics[ioapic_idx].nr_registers);
@@ -1597,7 +1717,7 @@ __apicdebuginit(void) print_IO_APICs(void)
*/
printk(KERN_INFO "testing the IO APIC.......................\n");
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ for_each_ioapic(ioapic_idx)
print_IO_APIC(ioapic_idx);
printk(KERN_DEBUG "IRQ to pin mappings:\n");
@@ -1608,7 +1728,7 @@ __apicdebuginit(void) print_IO_APICs(void)
if (chip != &ioapic_chip)
continue;
- cfg = irq_get_chip_data(irq);
+ cfg = irq_cfg(irq);
if (!cfg)
continue;
entry = cfg->irq_2_pin;
@@ -1758,7 +1878,7 @@ __apicdebuginit(void) print_PIC(void)
unsigned int v;
unsigned long flags;
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs())
return;
printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1828,26 +1948,22 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
int i8259_apic, i8259_pin;
- int apic;
+ int apic, pin;
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs())
return;
- for(apic = 0; apic < nr_ioapics; apic++) {
- int pin;
+ for_each_ioapic_pin(apic, pin) {
/* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
- struct IO_APIC_route_entry entry;
- entry = ioapic_read_entry(apic, pin);
+ struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
- */
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
- ioapic_i8259.apic = apic;
- ioapic_i8259.pin = pin;
- goto found_i8259;
- }
+ /* If the interrupt line is enabled and in ExtInt mode
+ * I have found the pin where the i8259 is connected.
+ */
+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ goto found_i8259;
}
}
found_i8259:
@@ -1919,7 +2035,7 @@ void disable_IO_APIC(void)
*/
clear_IO_APIC();
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs())
return;
x86_io_apic_ops.disable();
@@ -1950,7 +2066,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+ for_each_ioapic(ioapic_idx) {
/* Read the register 0 value */
raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic_idx, 0);
@@ -2123,7 +2239,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < legacy_pic->nr_legacy_irqs) {
+ if (irq < nr_legacy_irqs()) {
legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
@@ -2225,7 +2341,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
goto unlock;
}
- __this_cpu_write(vector_irq[vector], -1);
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
unlock:
raw_spin_unlock(&desc->lock);
}
@@ -2253,7 +2369,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
void irq_force_complete_move(int irq)
{
- struct irq_cfg *cfg = irq_get_chip_data(irq);
+ struct irq_cfg *cfg = irq_cfg(irq);
if (!cfg)
return;
@@ -2514,26 +2630,15 @@ static inline void init_IO_APIC_traps(void)
struct irq_cfg *cfg;
unsigned int irq;
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
for_each_active_irq(irq) {
- cfg = irq_get_chip_data(irq);
+ cfg = irq_cfg(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < legacy_pic->nr_legacy_irqs)
+ if (irq < nr_legacy_irqs())
legacy_pic->make_irq(irq);
else
/* Strange. Oh, well.. */
@@ -2649,8 +2754,6 @@ static int __init disable_timer_pin_setup(char *arg)
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
-int timer_through_8259 __initdata;
-
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -2661,7 +2764,7 @@ int timer_through_8259 __initdata;
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_get_chip_data(0);
+ struct irq_cfg *cfg = irq_cfg(0);
int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
@@ -2755,7 +2858,6 @@ static inline void __init check_timer(void)
legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
- timer_through_8259 = 1;
goto out;
}
/*
@@ -2827,15 +2929,54 @@ out:
*/
#define PIC_IRQS (1UL << PIC_CASCADE_IR)
+static int mp_irqdomain_create(int ioapic)
+{
+ size_t size;
+ int hwirqs = mp_ioapic_pin_count(ioapic);
+ struct ioapic *ip = &ioapics[ioapic];
+ struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+ size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
+ ip->pin_info = kzalloc(size, GFP_KERNEL);
+ if (!ip->pin_info)
+ return -ENOMEM;
+
+ if (cfg->type == IOAPIC_DOMAIN_INVALID)
+ return 0;
+
+ ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
+ (void *)(long)ioapic);
+ if(!ip->irqdomain) {
+ kfree(ip->pin_info);
+ ip->pin_info = NULL;
+ return -ENOMEM;
+ }
+
+ if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
+ cfg->type == IOAPIC_DOMAIN_STRICT)
+ ioapic_dynirq_base = max(ioapic_dynirq_base,
+ gsi_cfg->gsi_end + 1);
+
+ if (gsi_cfg->gsi_base == 0)
+ irq_set_default_host(ip->irqdomain);
+
+ return 0;
+}
+
void __init setup_IO_APIC(void)
{
+ int ioapic;
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
- io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
+ io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+ for_each_ioapic(ioapic)
+ BUG_ON(mp_irqdomain_create(ioapic));
+
/*
* Set up IO-APIC IRQ routing.
*/
@@ -2844,8 +2985,10 @@ void __init setup_IO_APIC(void)
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
- if (legacy_pic->nr_legacy_irqs)
+ if (nr_legacy_irqs())
check_timer();
+
+ ioapic_initialized = 1;
}
/*
@@ -2880,7 +3023,7 @@ static void ioapic_resume(void)
{
int ioapic_idx;
- for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--)
+ for_each_ioapic_reverse(ioapic_idx)
resume_ioapic_id(ioapic_idx);
restore_ioapic_entries();
@@ -2926,7 +3069,7 @@ int arch_setup_hwirq(unsigned int irq, int node)
void arch_teardown_hwirq(unsigned int irq)
{
- struct irq_cfg *cfg = irq_get_chip_data(irq);
+ struct irq_cfg *cfg = irq_cfg(irq);
unsigned long flags;
free_remapped_irq(irq);
@@ -3053,7 +3196,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
if (!irq_offset)
write_msi_msg(irq, &msg);
- setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
+ setup_remapped_irq(irq, irq_cfg(irq), chip);
irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
@@ -3192,7 +3335,7 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id)
hpet_msi_write(irq_get_handler_data(irq), &msg);
irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
+ setup_remapped_irq(irq, irq_cfg(irq), chip);
irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
return 0;
@@ -3303,27 +3446,6 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
return ret;
}
-int io_apic_setup_irq_pin_once(unsigned int irq, int node,
- struct io_apic_irq_attr *attr)
-{
- unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
- int ret;
- struct IO_APIC_route_entry orig_entry;
-
- /* Avoid redundant programming */
- if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin);
- orig_entry = ioapic_read_entry(attr->ioapic, pin);
- if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity)
- return 0;
- return -EBUSY;
- }
- ret = io_apic_setup_irq_pin(irq, node, attr);
- if (!ret)
- set_bit(pin, ioapics[ioapic_idx].pin_programmed);
- return ret;
-}
-
static int __init io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
@@ -3340,20 +3462,13 @@ static int __init io_apic_get_redir_entries(int ioapic)
return reg_01.bits.entries + 1;
}
-static void __init probe_nr_irqs_gsi(void)
-{
- int nr;
-
- nr = gsi_top + NR_IRQS_LEGACY;
- if (nr > nr_irqs_gsi)
- nr_irqs_gsi = nr;
-
- printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
-}
-
unsigned int arch_dynirq_lower_bound(unsigned int from)
{
- return from < nr_irqs_gsi ? nr_irqs_gsi : from;
+ /*
+ * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
+ * gsi_top if ioapic_dynirq_base hasn't been initialized yet.
+ */
+ return ioapic_initialized ? ioapic_dynirq_base : gsi_top;
}
int __init arch_probe_nr_irqs(void)
@@ -3363,33 +3478,17 @@ int __init arch_probe_nr_irqs(void)
if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
nr_irqs = NR_VECTORS * nr_cpu_ids;
- nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+ nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
/*
* for MSI and HT dyn irq
*/
- nr += nr_irqs_gsi * 16;
+ nr += gsi_top * 16;
#endif
if (nr < nr_irqs)
nr_irqs = nr;
- return NR_IRQS_LEGACY;
-}
-
-int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr)
-{
- int node;
-
- if (!IO_APIC_IRQ(irq)) {
- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- irq_attr->ioapic);
- return -EINVAL;
- }
-
- node = dev ? dev_to_node(dev) : cpu_to_node(0);
-
- return io_apic_setup_irq_pin_once(irq, node, irq_attr);
+ return 0;
}
#ifdef CONFIG_X86_32
@@ -3483,9 +3582,8 @@ static u8 __init io_apic_unique_id(u8 id)
DECLARE_BITMAP(used, 256);
bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i)
__set_bit(mpc_ioapic_id(i), used);
- }
if (!test_bit(id, used))
return id;
return find_first_zero_bit(used, 256);
@@ -3543,14 +3641,13 @@ void __init setup_ioapic_dest(void)
if (skip_ioapic_setup == 1)
return;
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
- for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
+ for_each_ioapic_pin(ioapic, pin) {
irq_entry = find_irq_entry(ioapic, pin, mp_INT);
if (irq_entry == -1)
continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
- if ((ioapic > 0) && (irq > 16))
+ irq = pin_2_irq(irq_entry, ioapic, pin, 0);
+ if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq))
continue;
idata = irq_get_irq_data(irq);
@@ -3573,29 +3670,33 @@ void __init setup_ioapic_dest(void)
static struct resource *ioapic_resources;
-static struct resource * __init ioapic_setup_resources(int nr_ioapics)
+static struct resource * __init ioapic_setup_resources(void)
{
unsigned long n;
struct resource *res;
char *mem;
- int i;
+ int i, num = 0;
- if (nr_ioapics <= 0)
+ for_each_ioapic(i)
+ num++;
+ if (num == 0)
return NULL;
n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
- n *= nr_ioapics;
+ n *= num;
mem = alloc_bootmem(n);
res = (void *)mem;
- mem += sizeof(struct resource) * nr_ioapics;
+ mem += sizeof(struct resource) * num;
- for (i = 0; i < nr_ioapics; i++) {
- res[i].name = mem;
- res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ num = 0;
+ for_each_ioapic(i) {
+ res[num].name = mem;
+ res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
+ num++;
}
ioapic_resources = res;
@@ -3609,8 +3710,8 @@ void __init native_io_apic_init_mappings(void)
struct resource *ioapic_res;
int i;
- ioapic_res = ioapic_setup_resources(nr_ioapics);
- for (i = 0; i < nr_ioapics; i++) {
+ ioapic_res = ioapic_setup_resources();
+ for_each_ioapic(i) {
if (smp_found_config) {
ioapic_phys = mpc_ioapic_addr(i);
#ifdef CONFIG_X86_32
@@ -3641,8 +3742,6 @@ fake_ioapic_page:
ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
ioapic_res++;
}
-
- probe_nr_irqs_gsi();
}
void __init ioapic_insert_resources(void)
@@ -3657,7 +3756,7 @@ void __init ioapic_insert_resources(void)
return;
}
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i) {
insert_resource(&iomem_resource, r);
r++;
}
@@ -3665,16 +3764,15 @@ void __init ioapic_insert_resources(void)
int mp_find_ioapic(u32 gsi)
{
- int i = 0;
+ int i;
if (nr_ioapics == 0)
return -1;
/* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i) {
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
- if ((gsi >= gsi_cfg->gsi_base)
- && (gsi <= gsi_cfg->gsi_end))
+ if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
return i;
}
@@ -3686,7 +3784,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
{
struct mp_ioapic_gsi *gsi_cfg;
- if (WARN_ON(ioapic == -1))
+ if (WARN_ON(ioapic < 0))
return -1;
gsi_cfg = mp_ioapic_gsi_routing(ioapic);
@@ -3729,7 +3827,8 @@ static __init int bad_ioapic_register(int idx)
return 0;
}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg)
{
int idx = 0;
int entries;
@@ -3743,6 +3842,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
ioapics[idx].mp_config.type = MP_IOAPIC;
ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
ioapics[idx].mp_config.apicaddr = address;
+ ioapics[idx].irqdomain = NULL;
+ ioapics[idx].irqdomain_cfg = *cfg;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
@@ -3779,6 +3880,97 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
nr_ioapics++;
}
+int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ioapic = (int)(long)domain->host_data;
+ struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
+ struct io_apic_irq_attr attr;
+
+ /* Get default attribute if not set by caller yet */
+ if (!info->set) {
+ u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+
+ if (acpi_get_override_irq(gsi, &info->trigger,
+ &info->polarity) < 0) {
+ /*
+ * PCI interrupts are always polarity one level
+ * triggered.
+ */
+ info->trigger = 1;
+ info->polarity = 1;
+ }
+ info->node = NUMA_NO_NODE;
+
+ /*
+ * setup_IO_APIC_irqs() programs all legacy IRQs with default
+ * trigger and polarity attributes. Don't set the flag for that
+ * case so the first legacy IRQ user could reprogram the pin
+ * with real trigger and polarity attributes.
+ */
+ if (virq >= nr_legacy_irqs() || info->count)
+ info->set = 1;
+ }
+ set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
+ info->polarity);
+
+ return io_apic_setup_irq_pin(virq, info->node, &attr);
+}
+
+void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_cfg *cfg = irq_cfg(virq);
+ int ioapic = (int)(long)domain->host_data;
+ int pin = (int)data->hwirq;
+
+ ioapic_mask_entry(ioapic, pin);
+ __remove_pin_from_irq(cfg, ioapic, pin);
+ WARN_ON(cfg->irq_2_pin != NULL);
+ arch_teardown_hwirq(virq);
+}
+
+int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+{
+ int ret = 0;
+ int ioapic, pin;
+ struct mp_pin_info *info;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -ENODEV;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ info = mp_pin_info(ioapic, pin);
+ trigger = trigger ? 1 : 0;
+ polarity = polarity ? 1 : 0;
+
+ mutex_lock(&ioapic_mutex);
+ if (!info->set) {
+ info->trigger = trigger;
+ info->polarity = polarity;
+ info->node = node;
+ info->set = 1;
+ } else if (info->trigger != trigger || info->polarity != polarity) {
+ ret = -EBUSY;
+ }
+ mutex_unlock(&ioapic_mutex);
+
+ return ret;
+}
+
+bool mp_should_keep_irq(struct device *dev)
+{
+ if (dev->power.is_prepared)
+ return true;
+#ifdef CONFIG_PM_RUNTIME
+ if (dev->power.runtime_status == RPM_SUSPENDING)
+ return true;
+#endif
+
+ return false;
+}
+
/* Enable IOAPIC early just for system timer */
void __init pre_init_apic_IRQ0(void)
{
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index cceb352c968c..bda488680dbc 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -88,21 +88,16 @@ static struct apic apic_default = {
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = default_check_apicid_used,
- .check_apicid_present = default_check_apicid_present,
.vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = default_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = setup_apic_flat_routing,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = default_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = default_get_apic_id,
.set_apic_id = NULL,
@@ -116,11 +111,7 @@ static struct apic apic_default = {
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
.wait_for_init_deassert = true,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
@@ -214,29 +205,7 @@ void __init generic_apic_probe(void)
printk(KERN_INFO "Using APIC driver %s\n", apic->name);
}
-/* These functions can switch the APIC even after the initial ->probe() */
-
-int __init
-generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- struct apic **drv;
-
- for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
- if (!((*drv)->mps_oem_check))
- continue;
- if (!(*drv)->mps_oem_check(mpc, oem, productid))
- continue;
-
- if (!cmdline_apic) {
- apic = *drv;
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- apic->name);
- }
- return 1;
- }
- return 0;
-}
-
+/* This function can switch the APIC even after the initial ->probe() */
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
struct apic **drv;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e66766bf1641..6ce600f9bc78 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -249,21 +249,16 @@ static struct apic apic_x2apic_cluster = {
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = cluster_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = x2apic_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
@@ -277,10 +272,7 @@ static struct apic apic_x2apic_cluster = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6d600ebf6c12..6fae733e9194 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -103,21 +103,16 @@ static struct apic apic_x2apic_phys = {
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = x2apic_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
@@ -131,10 +126,7 @@ static struct apic apic_x2apic_phys = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 293b41df54ef..004f017aa7b9 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -365,21 +365,16 @@ static struct apic __refdata apic_x2apic_uv_x = {
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = uv_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = uv_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = set_apic_id,
@@ -394,10 +389,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.send_IPI_self = uv_send_IPI_self,
.wakeup_secondary_cpu = uv_wakeup_secondary,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ce8b8ff0e0ef..60e5497681f5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
+#include <asm/smp.h>
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
@@ -50,7 +51,6 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
return wrmsr_safe_regs(gprs);
}
-#ifdef CONFIG_X86_32
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
@@ -70,6 +70,7 @@ __asm__(".globl vide\n\t.align 4\nvide: ret");
static void init_amd_k5(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
* of the Elan at 0x000df000. Unfortuantly, one of the Linux
@@ -83,11 +84,12 @@ static void init_amd_k5(struct cpuinfo_x86 *c)
if (inl(CBAR) & CBAR_ENB)
outl(0 | CBAR_KEY, CBAR);
}
+#endif
}
-
static void init_amd_k6(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
u32 l, h;
int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
@@ -176,10 +178,44 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
/* placeholder for any needed mods */
return;
}
+#endif
}
-static void amd_k7_smp_check(struct cpuinfo_x86 *c)
+static void init_amd_k7(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ printk(KERN_INFO
+ "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
+ set_cpu_cap(c, X86_FEATURE_K7);
+
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -207,7 +243,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
((c->x86_model == 7) && (c->x86_mask >= 1)) ||
(c->x86_model > 7))
- if (cpu_has_mp)
+ if (cpu_has(c, X86_FEATURE_MP))
return;
/* If we get here, not a certified SMP capable AMD system. */
@@ -219,45 +255,8 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
-}
-
-static void init_amd_k7(struct cpuinfo_x86 *c)
-{
- u32 l, h;
-
- /*
- * Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
- * If the BIOS didn't enable it already, enable it here.
- */
- if (c->x86_model >= 6 && c->x86_model <= 10) {
- if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- msr_clear_bit(MSR_K7_HWCR, 15);
- set_cpu_cap(c, X86_FEATURE_XMM);
- }
- }
-
- /*
- * It's been determined by AMD that Athlons since model 8 stepping 1
- * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
- * As per AMD technical note 27212 0.2
- */
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
- rdmsr(MSR_K7_CLK_CTL, l, h);
- if ((l & 0xfff00000) != 0x20000000) {
- printk(KERN_INFO
- "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
- l, ((l & 0x000fffff)|0x20000000));
- wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
- }
- }
-
- set_cpu_cap(c, X86_FEATURE_K7);
-
- amd_k7_smp_check(c);
-}
#endif
+}
#ifdef CONFIG_NUMA
/*
@@ -446,6 +445,26 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c)
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
+
+#ifdef CONFIG_X86_64
+ if (c->x86 >= 0xf) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
+ printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+ }
+#endif
+
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
if (c->x86 > 0x10 ||
@@ -515,101 +534,74 @@ static const int amd_erratum_383[];
static const int amd_erratum_400[];
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
-static void init_amd(struct cpuinfo_x86 *c)
+static void init_amd_k8(struct cpuinfo_x86 *c)
{
- u32 dummy;
- unsigned long long value;
+ u32 level;
+ u64 value;
-#ifdef CONFIG_SMP
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 0xf)
- msr_set_bit(MSR_K7_HWCR, 6);
-#endif
-
- early_init_amd(c);
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
*/
- clear_cpu_cap(c, 0*32+31);
-
-#ifdef CONFIG_X86_64
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- if (c->x86 == 0xf) {
- u32 level;
-
- level = cpuid_eax(1);
- if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /*
- * Some BIOSes incorrectly force this feature, but only K8
- * revision D (model = 0x14) and later actually support it.
- * (AMD Erratum #110, docId: 25759).
- */
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
- clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &value)) {
- value &= ~(1ULL << 32);
- wrmsrl_amd_safe(0xc001100d, value);
- }
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrl_amd_safe(0xc001100d, value);
}
-
}
- if (c->x86 >= 0x10)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- /* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
-#else
+ if (!c->x86_model_id[0])
+ strcpy(c->x86_model_id, "Hammer");
+}
+
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+#endif
/*
- * FIXME: We should handle the K5 here. Set up the write
- * range and also turn on MSR 83 bits 4 and 31 (write alloc,
- * no bus pipeline)
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
- switch (c->x86) {
- case 4:
- init_amd_k5(c);
- break;
- case 5:
- init_amd_k6(c);
- break;
- case 6: /* An Athlon/Duron */
- init_amd_k7(c);
- break;
- }
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- /* K6s reports MCEs but don't actually have all the MSRs */
- if (c->x86 < 6)
- clear_cpu_cap(c, X86_FEATURE_MCE);
-#endif
+ if (cpu_has_amd_erratum(c, amd_erratum_383))
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+}
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- if (!c->x86_model_id[0]) {
- switch (c->x86) {
- case 0xf:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+ u64 value;
/* re-enable TopologyExtensions if switched off by BIOS */
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
+ if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
@@ -625,14 +617,60 @@ static void init_amd(struct cpuinfo_x86 *c)
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
*/
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
-
+ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
value |= 0x1E;
wrmsrl_safe(0xc0011021, value);
}
}
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ u32 dummy;
+
+#ifdef CONFIG_SMP
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 0xf)
+ msr_set_bit(MSR_K7_HWCR, 6);
+#endif
+
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* get apicid instead of initial apic id from cpuid */
+ c->apicid = hard_smp_processor_id();
+
+ /* K6s reports MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x15: init_amd_bd(c); break;
+ }
+
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
cpu_detect_cache_sizes(c);
@@ -656,33 +694,6 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
}
-#ifdef CONFIG_X86_64
- if (c->x86 == 0x10) {
- /* do this for boot cpu */
- if (c == &boot_cpu_data)
- check_enable_amd_mmconf_dmi();
-
- fam10h_check_enable_mmcfg();
- }
-
- if (c == &boot_cpu_data && c->x86 >= 0xf) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
- }
-#endif
-
/*
* Family 0x12 and above processors have APIC timer
* running in deep C states.
@@ -690,34 +701,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- if (c->x86 == 0x10) {
- /*
- * Disable GART TLB Walk Errors on Fam10h. We do this here
- * because this is always needed when GART is enabled, even in a
- * kernel which has no MCE support built in.
- * BIOS should disable GartTlbWlk Errors already. If
- * it doesn't, do it here as suggested by the BKDG.
- *
- * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
- */
- msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
-
- /*
- * On family 10h BIOS may not have properly enabled WC+ support,
- * causing it to be converted to CD memtype. This may result in
- * performance degradation for certain nested-paging guests.
- * Prevent this conversion by clearing bit 24 in
- * MSR_AMD64_BU_CFG2.
- *
- * NOTE: we want to use the _safe accessors so as not to #GP kvm
- * guests on older kvm hosts.
- */
- msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
-
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
- }
-
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
@@ -741,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
}
#endif
-static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
-{
- tlb_flushall_shift = 6;
-}
-
static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
{
u32 ebx, eax, ecx, edx;
@@ -793,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
-
- cpu_set_tlb_flushall_shift(c);
}
static const struct cpu_dev amd_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ef1b93f18ed1..e4ab2b42bd6f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -148,6 +148,7 @@ static int __init x86_xsave_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
return 1;
@@ -161,6 +162,13 @@ static int __init x86_xsaveopt_setup(char *s)
}
__setup("noxsaveopt", x86_xsaveopt_setup);
+static int __init x86_xsaves_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ return 1;
+}
+__setup("noxsaves", x86_xsaves_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
static int disable_x86_serial_nr = 1;
@@ -481,26 +489,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
u16 __read_mostly tlb_lld_4m[NR_INFO];
u16 __read_mostly tlb_lld_1g[NR_INFO];
-/*
- * tlb_flushall_shift shows the balance point in replacing cr3 write
- * with multiple 'invlpg'. It will do this replacement when
- * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
- * If tlb_flushall_shift is -1, means the replacement will be disabled.
- */
-s8 __read_mostly tlb_flushall_shift = -1;
-
void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
if (this_cpu->c_detect_tlb)
this_cpu->c_detect_tlb(c);
printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
- "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
- "tlb_flushall_shift: %d\n",
+ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
- tlb_lld_1g[ENTRIES], tlb_flushall_shift);
+ tlb_lld_1g[ENTRIES]);
}
void detect_ht(struct cpuinfo_x86 *c)
@@ -634,6 +633,15 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[9] = ebx;
}
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[10] = eax;
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f9e4fdd3b877..74e804ddc5c7 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -253,7 +253,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
*/
if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
(c->x86_mask < 0x6 || c->x86_mask == 0xb))
- set_cpu_cap(c, X86_FEATURE_11AP);
+ set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
@@ -402,7 +402,7 @@ static void init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 6 && cpu_has_clflush &&
(c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
- set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
@@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc)
}
}
-static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
-{
- switch ((c->x86 << 8) + c->x86_model) {
- case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 0x61d: /* six-core 45 nm xeon "Dunnington" */
- tlb_flushall_shift = -1;
- break;
- case 0x63a: /* Ivybridge */
- tlb_flushall_shift = 2;
- break;
- case 0x61a: /* 45 nm nehalem, "Bloomfield" */
- case 0x61e: /* 45 nm nehalem, "Lynnfield" */
- case 0x625: /* 32 nm nehalem, "Clarkdale" */
- case 0x62c: /* 32 nm nehalem, "Gulftown" */
- case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
- case 0x62f: /* 32 nm Xeon E7 */
- case 0x62a: /* SandyBridge */
- case 0x62d: /* SandyBridge, "Romely-EP" */
- default:
- tlb_flushall_shift = 6;
- }
-}
-
static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
int i, j, n;
@@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
for (j = 1 ; j < 16 ; j++)
intel_tlb_lookup(desc[j]);
}
- intel_tlb_flushall_shift_set(c);
}
static const struct cpu_dev intel_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 9c8f7394c612..c7035073dfc1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -461,7 +461,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- if (strict_strtoul(buf, 10, &val) < 0)
+ if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
@@ -511,7 +511,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return -EINVAL;
- if (strict_strtoul(buf, 16, &val) < 0)
+ if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
if (amd_set_subcaches(cpu, val))
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9a79c8dbd8e8..bd9ccda8087f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2136,7 +2136,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
attr_to_bank(attr)->ctl = new;
@@ -2174,7 +2174,7 @@ static ssize_t set_ignore_ce(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.ignore_ce ^ !!new) {
@@ -2198,7 +2198,7 @@ static ssize_t set_cmci_disabled(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.cmci_disabled ^ !!new) {
@@ -2385,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
mce_intel_hcpu_update(cpu);
+
+ /* intentionally ignoring frozen here */
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_rediscover();
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
@@ -2396,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
}
- if (action == CPU_POST_DEAD) {
- /* intentionally ignoring frozen here */
- cmci_rediscover();
- }
-
return NOTIFY_OK;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 603df4f74640..1e49f8f41276 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -353,7 +353,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
if (!b->interrupt_capable)
return -EINVAL;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
b->interrupt_enable = !!new;
@@ -372,7 +372,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
struct thresh_restart tr;
unsigned long new;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
if (new > THRESHOLD_MAX)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 9a316b21df8b..3bdb95ae8c43 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
-static DEFINE_SPINLOCK(cmci_discover_lock);
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
@@ -144,14 +144,14 @@ static void cmci_storm_disable_banks(void)
int bank;
u64 val;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
owned = __get_cpu_var(mce_banks_owned);
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static bool cmci_storm_detect(void)
@@ -211,7 +211,7 @@ static void cmci_discover(int banks)
int i;
int bios_wrong_thresh = 0;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
@@ -266,7 +266,7 @@ static void cmci_discover(int banks)
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
@@ -316,10 +316,10 @@ void cmci_clear(void)
if (!cmci_supported(&banks))
return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++)
__cmci_disable_bank(i);
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void cmci_rediscover_work_func(void *arg)
@@ -360,9 +360,9 @@ void cmci_disable_bank(int bank)
if (!cmci_supported(&banks))
return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
__cmci_disable_bank(bank);
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 2bf616505499..e2b22df964cd 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -1,23 +1,25 @@
#!/bin/sh
#
-# Generate the x86_cap_flags[] array from include/asm/cpufeature.h
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
#
IN=$1
OUT=$2
-TABS="$(printf '\t\t\t\t\t')"
-trap 'rm "$OUT"' EXIT
+function dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
-(
- echo "#ifndef _ASM_X86_CPUFEATURE_H"
- echo "#include <asm/cpufeature.h>"
- echo "#endif"
- echo ""
- echo "const char * const x86_cap_flags[NCAPINTS*32] = {"
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
+
+ echo "const char * const $ARRAY[$SIZE] = {"
- # Iterate through any input lines starting with #define X86_FEATURE_
- sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN |
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
while read i
do
# Name is everything up to the first whitespace
@@ -31,11 +33,32 @@ trap 'rm "$OUT"' EXIT
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
- TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 ))
- printf "\t[%s]%.*s = %s,\n" \
- "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
done
echo "};"
+}
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURE_H"
+ echo "#include <asm/cpufeature.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+
) > $OUT
trap - EXIT
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index 3bbdf4cd38b9..30790d798e6b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -294,31 +294,41 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
cpu_to_node(cpu));
}
-static void amd_uncore_cpu_up_prepare(unsigned int cpu)
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
{
- struct amd_uncore *uncore;
+ struct amd_uncore *uncore_nb = NULL, *uncore_l2;
if (amd_uncore_nb) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_NB;
- uncore->rdpmc_base = RDPMC_BASE_NB;
- uncore->msr_base = MSR_F15H_NB_PERF_CTL;
- uncore->active_mask = &amd_nb_active_mask;
- uncore->pmu = &amd_nb_pmu;
- *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+ uncore_nb = amd_uncore_alloc(cpu);
+ if (!uncore_nb)
+ goto fail;
+ uncore_nb->cpu = cpu;
+ uncore_nb->num_counters = NUM_COUNTERS_NB;
+ uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+ uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+ uncore_nb->active_mask = &amd_nb_active_mask;
+ uncore_nb->pmu = &amd_nb_pmu;
+ *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
if (amd_uncore_l2) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_L2;
- uncore->rdpmc_base = RDPMC_BASE_L2;
- uncore->msr_base = MSR_F16H_L2I_PERF_CTL;
- uncore->active_mask = &amd_l2_active_mask;
- uncore->pmu = &amd_l2_pmu;
- *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+ uncore_l2 = amd_uncore_alloc(cpu);
+ if (!uncore_l2)
+ goto fail;
+ uncore_l2->cpu = cpu;
+ uncore_l2->num_counters = NUM_COUNTERS_L2;
+ uncore_l2->rdpmc_base = RDPMC_BASE_L2;
+ uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
+ uncore_l2->active_mask = &amd_l2_active_mask;
+ uncore_l2->pmu = &amd_l2_pmu;
+ *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
}
+
+ return 0;
+
+fail:
+ kfree(uncore_nb);
+ return -ENOMEM;
}
static struct amd_uncore *
@@ -441,7 +451,7 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
if (!--uncore->refcnt)
kfree(uncore);
- *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+ *per_cpu_ptr(uncores, cpu) = NULL;
}
static void amd_uncore_cpu_dead(unsigned int cpu)
@@ -461,7 +471,8 @@ amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- amd_uncore_cpu_up_prepare(cpu);
+ if (amd_uncore_cpu_up_prepare(cpu))
+ return notifier_from_errno(-ENOMEM);
break;
case CPU_STARTING:
@@ -501,20 +512,33 @@ static void __init init_cpu_already_online(void *dummy)
amd_uncore_cpu_online(cpu);
}
+static void cleanup_cpu_online(void *dummy)
+{
+ unsigned int cpu = smp_processor_id();
+
+ amd_uncore_cpu_dead(cpu);
+}
+
static int __init amd_uncore_init(void)
{
- unsigned int cpu;
+ unsigned int cpu, cpu2;
int ret = -ENODEV;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return -ENODEV;
+ goto fail_nodev;
if (!cpu_has_topoext)
- return -ENODEV;
+ goto fail_nodev;
if (cpu_has_perfctr_nb) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (!amd_uncore_nb) {
+ ret = -ENOMEM;
+ goto fail_nb;
+ }
+ ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (ret)
+ goto fail_nb;
printk(KERN_INFO "perf: AMD NB counters detected\n");
ret = 0;
@@ -522,20 +546,28 @@ static int __init amd_uncore_init(void)
if (cpu_has_perfctr_l2) {
amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ if (!amd_uncore_l2) {
+ ret = -ENOMEM;
+ goto fail_l2;
+ }
+ ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ if (ret)
+ goto fail_l2;
printk(KERN_INFO "perf: AMD L2I counters detected\n");
ret = 0;
}
if (ret)
- return -ENODEV;
+ goto fail_nodev;
cpu_notifier_register_begin();
/* init cpus already online before registering for hotplug notifier */
for_each_online_cpu(cpu) {
- amd_uncore_cpu_up_prepare(cpu);
+ ret = amd_uncore_cpu_up_prepare(cpu);
+ if (ret)
+ goto fail_online;
smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
}
@@ -543,5 +575,30 @@ static int __init amd_uncore_init(void)
cpu_notifier_register_done();
return 0;
+
+
+fail_online:
+ for_each_online_cpu(cpu2) {
+ if (cpu2 == cpu)
+ break;
+ smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
+ }
+ cpu_notifier_register_done();
+
+ /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
+ amd_uncore_nb = amd_uncore_l2 = NULL;
+ if (cpu_has_perfctr_l2)
+ perf_pmu_unregister(&amd_l2_pmu);
+fail_l2:
+ if (cpu_has_perfctr_nb)
+ perf_pmu_unregister(&amd_nb_pmu);
+ if (amd_uncore_l2)
+ free_percpu(amd_uncore_l2);
+fail_nb:
+ if (amd_uncore_nb)
+ free_percpu(amd_uncore_nb);
+
+fail_nodev:
+ return ret;
}
device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index ae6552a0701f..0939f86f543d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -945,7 +945,7 @@ static struct intel_uncore_type *snbep_pci_uncores[] = {
NULL,
};
-static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
{ /* Home Agent */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
@@ -1510,7 +1510,7 @@ static struct intel_uncore_type *ivt_pci_uncores[] = {
NULL,
};
-static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
+static const struct pci_device_id ivt_uncore_pci_ids[] = {
{ /* Home Agent 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0),
@@ -1985,7 +1985,7 @@ static struct intel_uncore_type *snb_pci_uncores[] = {
NULL,
};
-static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = {
+static const struct pci_device_id snb_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
@@ -1993,7 +1993,7 @@ static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = {
{ /* end: all zeroes */ },
};
-static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = {
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
@@ -2001,7 +2001,7 @@ static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = {
{ /* end: all zeroes */ },
};
-static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = {
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
@@ -2947,10 +2947,7 @@ again:
* extra registers. If we failed to take an extra
* register, try the alternative.
*/
- if (idx % 2)
- idx--;
- else
- idx++;
+ idx ^= 1;
if (idx != reg1->idx % 6) {
if (idx == 2)
config1 >>= 8;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 06fe3ed8b851..5433658e598d 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -97,6 +97,14 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+ seq_printf(m, "\nbugs\t\t:");
+ for (i = 0; i < 32*NBUGINTS; i++) {
+ unsigned int bug_bit = 32*NCAPINTS + i;
+
+ if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+ seq_printf(m, " %s", x86_bug_flags[i]);
+ }
+
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index b6f794aa1693..4a8013d55947 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -38,7 +38,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 507de8066594..a618fcd2c07d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -4,9 +4,14 @@
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
*
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -16,6 +21,7 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -28,6 +34,45 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
+/* Alignment required for elf header segment */
+#define ELF_CORE_HEADER_ALIGN 4096
+
+/* This primarily represents number of split ranges due to exclusion */
+#define CRASH_MAX_RANGES 16
+
+struct crash_mem_range {
+ u64 start, end;
+};
+
+struct crash_mem {
+ unsigned int nr_ranges;
+ struct crash_mem_range ranges[CRASH_MAX_RANGES];
+};
+
+/* Misc data about ram ranges needed to prepare elf headers */
+struct crash_elf_data {
+ struct kimage *image;
+ /*
+ * Total number of ram ranges we have after various adjustments for
+ * GART, crash reserved region etc.
+ */
+ unsigned int max_nr_ranges;
+ unsigned long gart_start, gart_end;
+
+ /* Pointer to elf header */
+ void *ehdr;
+ /* Pointer to next phdr */
+ void *bufp;
+ struct crash_mem mem;
+};
+
+/* Used while preparing memory map entries for second kernel */
+struct crash_memmap_data {
+ struct boot_params *params;
+ /* Type of memory */
+ unsigned int type;
+};
+
int in_crash_kexec;
/*
@@ -39,6 +84,7 @@ int in_crash_kexec;
*/
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+unsigned long crash_zero_bytes;
static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
@@ -135,3 +181,518 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#endif
crash_save_cpu(regs, safe_smp_processor_id());
}
+
+#ifdef CONFIG_KEXEC_FILE
+static int get_nr_ram_ranges_callback(unsigned long start_pfn,
+ unsigned long nr_pfn, void *arg)
+{
+ int *nr_ranges = arg;
+
+ (*nr_ranges)++;
+ return 0;
+}
+
+static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_elf_data *ced = arg;
+
+ ced->gart_start = start;
+ ced->gart_end = end;
+
+ /* Not expecting more than 1 gart aperture */
+ return 1;
+}
+
+
+/* Gather all the required information to prepare elf headers for ram regions */
+static void fill_up_crash_elf_data(struct crash_elf_data *ced,
+ struct kimage *image)
+{
+ unsigned int nr_ranges = 0;
+
+ ced->image = image;
+
+ walk_system_ram_range(0, -1, &nr_ranges,
+ get_nr_ram_ranges_callback);
+
+ ced->max_nr_ranges = nr_ranges;
+
+ /*
+ * We don't create ELF headers for GART aperture as an attempt
+ * to dump this memory in second kernel leads to hang/crash.
+ * If gart aperture is present, one needs to exclude that region
+ * and that could lead to need of extra phdr.
+ */
+ walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
+ ced, get_gart_ranges_callback);
+
+ /*
+ * If we have gart region, excluding that could potentially split
+ * a memory range, resulting in extra header. Account for that.
+ */
+ if (ced->gart_end)
+ ced->max_nr_ranges++;
+
+ /* Exclusion of crash region could split memory ranges */
+ ced->max_nr_ranges++;
+
+ /* If crashk_low_res is not 0, another range split possible */
+ if (crashk_low_res.end != 0)
+ ced->max_nr_ranges++;
+}
+
+static int exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end;
+ struct crash_mem_range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ mstart = start;
+ if (mend > end)
+ mend = end;
+
+ /* Found completely overlapping range */
+ if (mstart == start && mend == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (mstart > start && mend < end) {
+ /* Split original range */
+ mem->ranges[i].end = mstart - 1;
+ temp_range.start = mend + 1;
+ temp_range.end = end;
+ } else if (mstart != start)
+ mem->ranges[i].end = mstart - 1;
+ else
+ mem->ranges[i].start = mend + 1;
+ break;
+ }
+
+ /* If a split happend, add the split to array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened */
+ if (i == CRASH_MAX_RANGES - 1) {
+ pr_err("Too many crash ranges after split\n");
+ return -ENOMEM;
+ }
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
+
+/*
+ * Look for any unwanted ranges between mstart, mend and remove them. This
+ * might lead to split and split ranges are put in ced->mem.ranges[] array
+ */
+static int elf_header_exclude_ranges(struct crash_elf_data *ced,
+ unsigned long long mstart, unsigned long long mend)
+{
+ struct crash_mem *cmem = &ced->mem;
+ int ret = 0;
+
+ memset(cmem->ranges, 0, sizeof(cmem->ranges));
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude crashkernel region */
+ ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ return ret;
+
+ ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
+ if (ret)
+ return ret;
+
+ /* Exclude GART region */
+ if (ced->gart_end) {
+ ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_elf_data *ced = arg;
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long mstart, mend;
+ struct kimage *image = ced->image;
+ struct crash_mem *cmem;
+ int ret, i;
+
+ ehdr = ced->ehdr;
+
+ /* Exclude unwanted mem ranges */
+ ret = elf_header_exclude_ranges(ced, start, end);
+ if (ret)
+ return ret;
+
+ /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */
+ cmem = &ced->mem;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ mstart = cmem->ranges[i].start;
+ mend = cmem->ranges[i].end;
+
+ phdr = ced->bufp;
+ ced->bufp += sizeof(Elf64_Phdr);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ /*
+ * If a range matches backup region, adjust offset to backup
+ * segment.
+ */
+ if (mstart == image->arch.backup_src_start &&
+ (mend - mstart + 1) == image->arch.backup_src_sz)
+ phdr->p_offset = image->arch.backup_load_addr;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_headers(struct crash_elf_data *ced,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf, *bufp;
+ unsigned int cpu;
+ unsigned long long notes_addr;
+ int ret;
+
+ /* extra phdr for vmcoreinfo elf note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += ced->max_nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area on x86_64 (ffffffff80000000 - ffffffffa0000000).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two elf headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ bufp = buf;
+ ehdr = (Elf64_Ehdr *)bufp;
+ bufp += sizeof(Elf64_Ehdr);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each present cpu */
+ for_each_present_cpu(cpu) {
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
+ (ehdr->e_phnum)++;
+
+#ifdef CONFIG_X86_64
+ /* Prepare PT_LOAD type program header for kernel text region */
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ (ehdr->e_phnum)++;
+#endif
+
+ /* Prepare PT_LOAD headers for system ram chunks. */
+ ced->ehdr = ehdr;
+ ced->bufp = bufp;
+ ret = walk_system_ram_res(0, -1, ced,
+ prepare_elf64_ram_headers_callback);
+ if (ret < 0)
+ return ret;
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
+
+/* Prepare elf headers. Return addr and size */
+static int prepare_elf_headers(struct kimage *image, void **addr,
+ unsigned long *sz)
+{
+ struct crash_elf_data *ced;
+ int ret;
+
+ ced = kzalloc(sizeof(*ced), GFP_KERNEL);
+ if (!ced)
+ return -ENOMEM;
+
+ fill_up_crash_elf_data(ced, image);
+
+ /* By default prepare 64bit headers */
+ ret = prepare_elf64_headers(ced, addr, sz);
+ kfree(ced);
+ return ret;
+}
+
+static int add_e820_entry(struct boot_params *params, struct e820entry *entry)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = params->e820_entries;
+ if (nr_e820_entries >= E820MAX)
+ return 1;
+
+ memcpy(&params->e820_map[nr_e820_entries], entry,
+ sizeof(struct e820entry));
+ params->e820_entries++;
+ return 0;
+}
+
+static int memmap_entry_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_memmap_data *cmd = arg;
+ struct boot_params *params = cmd->params;
+ struct e820entry ei;
+
+ ei.addr = start;
+ ei.size = end - start + 1;
+ ei.type = cmd->type;
+ add_e820_entry(params, &ei);
+
+ return 0;
+}
+
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+ unsigned long long mstart,
+ unsigned long long mend)
+{
+ unsigned long start, end;
+ int ret = 0;
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude Backup region */
+ start = image->arch.backup_load_addr;
+ end = start + image->arch.backup_src_sz - 1;
+ ret = exclude_mem_range(cmem, start, end);
+ if (ret)
+ return ret;
+
+ /* Exclude elf header region */
+ start = image->arch.elf_load_addr;
+ end = start + image->arch.elf_headers_sz - 1;
+ return exclude_mem_range(cmem, start, end);
+}
+
+/* Prepare memory map for crash dump kernel */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+ int i, ret = 0;
+ unsigned long flags;
+ struct e820entry ei;
+ struct crash_memmap_data cmd;
+ struct crash_mem *cmem;
+
+ cmem = vzalloc(sizeof(struct crash_mem));
+ if (!cmem)
+ return -ENOMEM;
+
+ memset(&cmd, 0, sizeof(struct crash_memmap_data));
+ cmd.params = params;
+
+ /* Add first 640K segment */
+ ei.addr = image->arch.backup_src_start;
+ ei.size = image->arch.backup_src_sz;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+
+ /* Add ACPI tables */
+ cmd.type = E820_ACPI;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI Non-volatile Storage */
+ cmd.type = E820_NVS;
+ walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add crashk_low_res region */
+ if (crashk_low_res.end) {
+ ei.addr = crashk_low_res.start;
+ ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ /* Exclude some ranges from crashk_res and add rest to memmap */
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
+ crashk_res.end);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
+
+ /* If entry is less than a page, skip it */
+ if (ei.size < PAGE_SIZE)
+ continue;
+ ei.addr = cmem->ranges[i].start;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+out:
+ vfree(cmem);
+ return ret;
+}
+
+static int determine_backup_region(u64 start, u64 end, void *arg)
+{
+ struct kimage *image = arg;
+
+ image->arch.backup_src_start = start;
+ image->arch.backup_src_sz = end - start + 1;
+
+ /* Expecting only one range for backup region */
+ return 1;
+}
+
+int crash_load_segments(struct kimage *image)
+{
+ unsigned long src_start, src_sz, elf_sz;
+ void *elf_addr;
+ int ret;
+
+ /*
+ * Determine and load a segment for backup area. First 640K RAM
+ * region is backup source
+ */
+
+ ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
+ image, determine_backup_region);
+
+ /* Zero or postive return values are ok */
+ if (ret < 0)
+ return ret;
+
+ src_start = image->arch.backup_src_start;
+ src_sz = image->arch.backup_src_sz;
+
+ /* Add backup segment. */
+ if (src_sz) {
+ /*
+ * Ideally there is no source for backup segment. This is
+ * copied in purgatory after crash. Just add a zero filled
+ * segment for now to make sure checksum logic works fine.
+ */
+ ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
+ sizeof(crash_zero_bytes), src_sz,
+ PAGE_SIZE, 0, -1, 0,
+ &image->arch.backup_load_addr);
+ if (ret)
+ return ret;
+ pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
+ image->arch.backup_load_addr, src_start, src_sz);
+ }
+
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+ if (ret)
+ return ret;
+
+ image->arch.elf_headers = elf_addr;
+ image->arch.elf_headers_sz = elf_sz;
+
+ ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
+ ELF_CORE_HEADER_ALIGN, 0, -1, 0,
+ &image->arch.elf_load_addr);
+ if (ret) {
+ vfree((void *)image->arch.elf_headers);
+ return ret;
+ }
+ pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->arch.elf_load_addr, elf_sz, elf_sz);
+
+ return ret;
+}
+#endif /* CONFIG_KEXEC_FILE */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 7db54b5d5f86..3d3503351242 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -21,6 +21,7 @@
#include <asm/apic.h>
#include <asm/pci_x86.h>
#include <asm/setup.h>
+#include <asm/i8259.h>
__initdata u64 initial_dtb;
char __initdata cmd_line[COMMAND_LINE_SIZE];
@@ -165,82 +166,6 @@ static void __init dtb_lapic_setup(void)
#ifdef CONFIG_X86_IO_APIC
static unsigned int ioapic_id;
-static void __init dtb_add_ioapic(struct device_node *dn)
-{
- struct resource r;
- int ret;
-
- ret = of_address_to_resource(dn, 0, &r);
- if (ret) {
- printk(KERN_ERR "Can't obtain address from node %s.\n",
- dn->full_name);
- return;
- }
- mp_register_ioapic(++ioapic_id, r.start, gsi_top);
-}
-
-static void __init dtb_ioapic_setup(void)
-{
- struct device_node *dn;
-
- for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
- dtb_add_ioapic(dn);
-
- if (nr_ioapics) {
- of_ioapic = 1;
- return;
- }
- printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
-}
-#else
-static void __init dtb_ioapic_setup(void) {}
-#endif
-
-static void __init dtb_apic_setup(void)
-{
- dtb_lapic_setup();
- dtb_ioapic_setup();
-}
-
-#ifdef CONFIG_OF_FLATTREE
-static void __init x86_flattree_get_config(void)
-{
- u32 size, map_len;
- void *dt;
-
- if (!initial_dtb)
- return;
-
- map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
-
- initial_boot_params = dt = early_memremap(initial_dtb, map_len);
- size = of_get_flat_dt_size();
- if (map_len < size) {
- early_iounmap(dt, map_len);
- initial_boot_params = dt = early_memremap(initial_dtb, size);
- map_len = size;
- }
-
- unflatten_and_copy_device_tree();
- early_iounmap(dt, map_len);
-}
-#else
-static inline void x86_flattree_get_config(void) { }
-#endif
-
-void __init x86_dtb_init(void)
-{
- x86_flattree_get_config();
-
- if (!of_have_populated_dt())
- return;
-
- dtb_setup_hpet();
- dtb_apic_setup();
-}
-
-#ifdef CONFIG_X86_IO_APIC
-
struct of_ioapic_type {
u32 out_type;
u32 trigger;
@@ -276,10 +201,8 @@ static int ioapic_xlate(struct irq_domain *domain,
const u32 *intspec, u32 intsize,
irq_hw_number_t *out_hwirq, u32 *out_type)
{
- struct io_apic_irq_attr attr;
struct of_ioapic_type *it;
- u32 line, idx;
- int rc;
+ u32 line, idx, gsi;
if (WARN_ON(intsize < 2))
return -EINVAL;
@@ -291,13 +214,10 @@ static int ioapic_xlate(struct irq_domain *domain,
it = &of_ioapic_type[intspec[1]];
- idx = (u32) domain->host_data;
- set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
-
- rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
- cpu_to_node(0), &attr);
- if (rc)
- return rc;
+ idx = (u32)(long)domain->host_data;
+ gsi = mp_pin_to_gsi(idx, line);
+ if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0)))
+ return -EBUSY;
*out_hwirq = line;
*out_type = it->out_type;
@@ -305,81 +225,86 @@ static int ioapic_xlate(struct irq_domain *domain,
}
const struct irq_domain_ops ioapic_irq_domain_ops = {
+ .map = mp_irqdomain_map,
+ .unmap = mp_irqdomain_unmap,
.xlate = ioapic_xlate,
};
-static void dt_add_ioapic_domain(unsigned int ioapic_num,
- struct device_node *np)
+static void __init dtb_add_ioapic(struct device_node *dn)
{
- struct irq_domain *id;
- struct mp_ioapic_gsi *gsi_cfg;
+ struct resource r;
int ret;
- int num;
-
- gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
- num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
-
- id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
- (void *)ioapic_num);
- BUG_ON(!id);
- if (gsi_cfg->gsi_base == 0) {
- /*
- * The first NR_IRQS_LEGACY irq descs are allocated in
- * early_irq_init() and need just a mapping. The
- * remaining irqs need both. All of them are preallocated
- * and assigned so we can keep the 1:1 mapping which the ioapic
- * is having.
- */
- irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
-
- if (num > NR_IRQS_LEGACY) {
- ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
- NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
- if (ret)
- pr_err("Error creating mapping for the "
- "remaining IRQs: %d\n", ret);
- }
- irq_set_default_host(id);
- } else {
- ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
- if (ret)
- pr_err("Error creating IRQ mapping: %d\n", ret);
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &ioapic_irq_domain_ops,
+ .dev = dn,
+ };
+
+ ret = of_address_to_resource(dn, 0, &r);
+ if (ret) {
+ printk(KERN_ERR "Can't obtain address from node %s.\n",
+ dn->full_name);
+ return;
}
+ mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
}
-static void __init ioapic_add_ofnode(struct device_node *np)
+static void __init dtb_ioapic_setup(void)
{
- struct resource r;
- int i, ret;
+ struct device_node *dn;
- ret = of_address_to_resource(np, 0, &r);
- if (ret) {
- printk(KERN_ERR "Failed to obtain address for %s\n",
- np->full_name);
+ for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
+ dtb_add_ioapic(dn);
+
+ if (nr_ioapics) {
+ of_ioapic = 1;
return;
}
+ printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+}
+#else
+static void __init dtb_ioapic_setup(void) {}
+#endif
- for (i = 0; i < nr_ioapics; i++) {
- if (r.start == mpc_ioapic_addr(i)) {
- dt_add_ioapic_domain(i, np);
- return;
- }
- }
- printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
+static void __init dtb_apic_setup(void)
+{
+ dtb_lapic_setup();
+ dtb_ioapic_setup();
}
-void __init x86_add_irq_domains(void)
+#ifdef CONFIG_OF_FLATTREE
+static void __init x86_flattree_get_config(void)
{
- struct device_node *dp;
+ u32 size, map_len;
+ void *dt;
- if (!of_have_populated_dt())
+ if (!initial_dtb)
return;
- for_each_node_with_property(dp, "interrupt-controller") {
- if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
- ioapic_add_ofnode(dp);
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+
+ initial_boot_params = dt = early_memremap(initial_dtb, map_len);
+ size = of_get_flat_dt_size();
+ if (map_len < size) {
+ early_iounmap(dt, map_len);
+ initial_boot_params = dt = early_memremap(initial_dtb, size);
+ map_len = size;
}
+
+ unflatten_and_copy_device_tree();
+ early_iounmap(dt, map_len);
}
#else
-void __init x86_add_irq_domains(void) { }
+static inline void x86_flattree_get_config(void) { }
#endif
+
+void __init x86_dtb_init(void)
+{
+ x86_flattree_get_config();
+
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 0d0c9d4ab6d5..4b0e1dfa2226 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -683,7 +683,7 @@ END(syscall_badsys)
sysenter_badsys:
movl $-ENOSYS,%eax
jmp sysenter_after_call
-END(syscall_badsys)
+END(sysenter_badsys)
CFI_ENDPROC
.macro FIXUP_ESPFIX_STACK
@@ -1059,9 +1059,6 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
pushl %eax
pushl %ecx
pushl %edx
@@ -1093,8 +1090,6 @@ END(ftrace_caller)
ENTRY(ftrace_regs_caller)
pushf /* push flags before compare (in cs location) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
/*
* i386 does not save SS and ESP when coming from kernel.
@@ -1153,7 +1148,6 @@ GLOBAL(ftrace_regs_call)
popf /* Pop flags at end (no addl to corrupt flags) */
jmp ftrace_ret
-ftrace_restore_flags:
popf
jmp ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */
@@ -1162,9 +1156,6 @@ ENTRY(mcount)
cmpl $__PAGE_OFFSET, %esp
jb ftrace_stub /* Paging not enabled yet? */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c844f0816ab8..2fac1343a90b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -207,7 +207,6 @@ ENDPROC(native_usergs_sysret64)
*/
.macro XCPT_FRAME start=1 offset=0
INTR_FRAME \start, RIP+\offset-ORIG_RAX
- /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
.endm
/*
@@ -287,21 +286,21 @@ ENDPROC(native_usergs_sysret64)
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
movq_cfi rdx, RDX+8
movq_cfi rcx, RCX+8
movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
movl $1,%ebx
movl $MSR_GS_BASE,%ecx
rdmsr
@@ -1387,21 +1386,21 @@ ENTRY(error_entry)
CFI_ADJUST_CFA_OFFSET 15*8
/* oldrax contains error code */
cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
+ movq %rdx, RDX+8(%rsp)
+ movq %rcx, RCX+8(%rsp)
+ movq %rax, RAX+8(%rsp)
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1419,6 +1418,7 @@ error_sti:
* compat mode. Check for these here too.
*/
error_kernelspace:
+ CFI_REL_OFFSET rcx, RCX+8
incl %ebx
leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cbc4a91b131e..3386dc9aa333 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -703,6 +703,9 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
unsigned long return_hooker = (unsigned long)
&return_to_handler;
+ if (unlikely(ftrace_graph_is_dead()))
+ return;
+
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5dd80814419..a9a4229f6161 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -375,7 +375,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
/*
* These bits must be zero.
*/
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+ memset(xsave_hdr->reserved, 0, 48);
return ret;
}
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
index d30acdc1229d..9030e83db6ee 100644
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -202,7 +202,7 @@ static int iosf_mbi_probe(struct pci_dev *pdev,
return 0;
}
-static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
+static const struct pci_device_id iosf_mbi_pci_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
{ 0, },
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7f50156542fb..44f1ed42fdf2 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -78,7 +78,7 @@ void __init init_ISA_irqs(void)
#endif
legacy_pic->init(0);
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+ for (i = 0; i < nr_legacy_irqs(); i++)
irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
}
@@ -87,12 +87,6 @@ void __init init_IRQ(void)
int i;
/*
- * We probably need a better place for this, but it works for
- * now ...
- */
- x86_add_irq_domains();
-
- /*
* On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
* If these IRQ's are handled by legacy interrupt-controllers like PIC,
* then this configuration will likely be static after the boot. If
@@ -100,7 +94,7 @@ void __init init_IRQ(void)
* then this vector space can be freed and re-used dynamically as the
* irq's migrate etc.
*/
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+ for (i = 0; i < nr_legacy_irqs(); i++)
per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
x86_init.irqs.intr_init();
@@ -121,7 +115,7 @@ void setup_vector_irq(int cpu)
* legacy PIC, for the new cpu that is coming online, setup the static
* legacy vector to irq mapping:
*/
- for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
+ for (irq = 0; irq < nr_legacy_irqs(); irq++)
per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
#endif
@@ -209,7 +203,7 @@ void __init native_init_IRQ(void)
set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
- if (!acpi_ioapic && !of_ioapic)
+ if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 000000000000..9642b9b33655
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,553 @@
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/verify_pefile.h>
+#include <keys/system_keyring.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+
+/*
+ * Defines lowest physical address for various segments. Not sure where
+ * exactly these limits came from. Current bzimage64 loader in kexec-tools
+ * uses these so I am retaining it. It can be changed over time as we gain
+ * more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * This is a place holder for all boot loader specific data structure which
+ * gets allocated in one call but gets freed much later during cleanup
+ * time. Right now there is only one field but it can grow as need be.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer to hold bootparams buffer. This should be
+ * freed once the bootparam segment has been loaded.
+ */
+ void *bootparams_buf;
+};
+
+static int setup_initrd(struct boot_params *params,
+ unsigned long initrd_load_addr, unsigned long initrd_len)
+{
+ params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL;
+ params->hdr.ramdisk_size = initrd_len & 0xffffffffUL;
+
+ params->ext_ramdisk_image = initrd_load_addr >> 32;
+ params->ext_ramdisk_size = initrd_len >> 32;
+
+ return 0;
+}
+
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+ unsigned long bootparams_load_addr,
+ unsigned long cmdline_offset, char *cmdline,
+ unsigned long cmdline_len)
+{
+ char *cmdline_ptr = ((char *)params) + cmdline_offset;
+ unsigned long cmdline_ptr_phys, len;
+ uint32_t cmdline_low_32, cmdline_ext_32;
+
+ memcpy(cmdline_ptr, cmdline, cmdline_len);
+ if (image->type == KEXEC_TYPE_CRASH) {
+ len = sprintf(cmdline_ptr + cmdline_len - 1,
+ " elfcorehdr=0x%lx", image->arch.elf_load_addr);
+ cmdline_len += len;
+ }
+ cmdline_ptr[cmdline_len - 1] = '\0';
+
+ pr_debug("Final command line is: %s\n", cmdline_ptr);
+ cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+ cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+ cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+ params->hdr.cmd_line_ptr = cmdline_low_32;
+ if (cmdline_ext_32)
+ params->ext_cmd_line_ptr = cmdline_ext_32;
+
+ return 0;
+}
+
+static int setup_e820_entries(struct boot_params *params)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = e820_saved.nr_map;
+
+ /* TODO: Pass entries more than E820MAX in bootparams setup data */
+ if (nr_e820_entries > E820MAX)
+ nr_e820_entries = E820MAX;
+
+ params->e820_entries = nr_e820_entries;
+ memcpy(&params->e820_map, &e820_saved.map,
+ nr_e820_entries * sizeof(struct e820entry));
+
+ return 0;
+}
+
+#ifdef CONFIG_EFI
+static int setup_efi_info_memmap(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset,
+ unsigned int efi_map_sz)
+{
+ void *efi_map = (void *)params + efi_map_offset;
+ unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_map_sz)
+ return 0;
+
+ efi_runtime_map_copy(efi_map, efi_map_sz);
+
+ ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+ ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+ ei->efi_memmap_size = efi_map_sz;
+
+ return 0;
+}
+
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned long setup_data_phys;
+ struct setup_data *sd = (void *)params + efi_setup_data_offset;
+ struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data);
+
+ esd->fw_vendor = efi.fw_vendor;
+ esd->runtime = efi.runtime;
+ esd->tables = efi.config_table;
+ esd->smbios = efi.smbios;
+
+ sd->type = SETUP_EFI;
+ sd->len = sizeof(struct efi_setup_data);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + efi_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+
+ return 0;
+}
+
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ struct efi_info *current_ei = &boot_params.efi_info;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!current_ei->efi_memmap_size)
+ return 0;
+
+ /*
+ * If 1:1 mapping is not enabled, second kernel can not setup EFI
+ * and use EFI run time services. User space will have to pass
+ * acpi_rsdp=<addr> on kernel command line to make second kernel boot
+ * without efi.
+ */
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ ei->efi_loader_signature = current_ei->efi_loader_signature;
+ ei->efi_systab = current_ei->efi_systab;
+ ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+ ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+ ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+ setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+ efi_map_sz);
+ prepare_add_efi_setup_data(params, params_load_addr,
+ efi_setup_data_offset);
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned int nr_e820_entries;
+ unsigned long long mem_k, start, end;
+ int i, ret = 0;
+
+ /* Get subarch from existing bootparams */
+ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+ /* Copying screen_info will do? */
+ memcpy(&params->screen_info, &boot_params.screen_info,
+ sizeof(struct screen_info));
+
+ /* Fill in memsize later */
+ params->screen_info.ext_mem_k = 0;
+ params->alt_mem_k = 0;
+
+ /* Default APM info */
+ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+ /* Default drive info */
+ memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+ memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+ /* Default sysdesc table */
+ params->sys_desc_table.length = 0;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_setup_memmap_entries(image, params);
+ if (ret)
+ return ret;
+ } else
+ setup_e820_entries(params);
+
+ nr_e820_entries = params->e820_entries;
+
+ for (i = 0; i < nr_e820_entries; i++) {
+ if (params->e820_map[i].type != E820_RAM)
+ continue;
+ start = params->e820_map[i].addr;
+ end = params->e820_map[i].addr + params->e820_map[i].size - 1;
+
+ if ((start <= 0x100000) && end > 0x100000) {
+ mem_k = (end >> 10) - (0x100000 >> 10);
+ params->screen_info.ext_mem_k = mem_k;
+ params->alt_mem_k = mem_k;
+ if (mem_k > 0xfc00)
+ params->screen_info.ext_mem_k = 0xfc00; /* 64M*/
+ if (mem_k > 0xffffffff)
+ params->alt_mem_k = 0xffffffff;
+ }
+ }
+
+#ifdef CONFIG_EFI
+ /* Setup EFI state */
+ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+#endif
+
+ /* Setup EDD info */
+ memcpy(params->eddbuf, boot_params.eddbuf,
+ EDDMAXNR * sizeof(struct edd_info));
+ params->eddbuf_entries = boot_params.eddbuf_entries;
+
+ memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+ EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+ return ret;
+}
+
+int bzImage64_probe(const char *buf, unsigned long len)
+{
+ int ret = -ENOEXEC;
+ struct setup_header *header;
+
+ /* kernel should be atleast two sectors long */
+ if (len < 2 * 512) {
+ pr_err("File is too short to be a bzImage\n");
+ return ret;
+ }
+
+ header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+ if (memcmp((char *)&header->header, "HdrS", 4) != 0) {
+ pr_err("Not a bzImage\n");
+ return ret;
+ }
+
+ if (header->boot_flag != 0xAA55) {
+ pr_err("No x86 boot sector present\n");
+ return ret;
+ }
+
+ if (header->version < 0x020C) {
+ pr_err("Must be at least protocol version 2.12\n");
+ return ret;
+ }
+
+ if (!(header->loadflags & LOADED_HIGH)) {
+ pr_err("zImage not a bzImage\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_KERNEL_64)) {
+ pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+ pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+ return ret;
+ }
+
+ /*
+ * Can't handle 32bit EFI as it does not allow loading kernel
+ * above 4G. This should be handled by 32bit bzImage loader
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+ pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+ return ret;
+ }
+
+ /* I've got a bzImage */
+ pr_debug("It's a relocatable bzImage64\n");
+ ret = 0;
+
+ return ret;
+}
+
+void *bzImage64_load(struct kimage *image, char *kernel,
+ unsigned long kernel_len, char *initrd,
+ unsigned long initrd_len, char *cmdline,
+ unsigned long cmdline_len)
+{
+
+ struct setup_header *header;
+ int setup_sects, kern16_size, ret = 0;
+ unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+ struct boot_params *params;
+ unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+ unsigned long purgatory_load_addr;
+ unsigned long kernel_bufsz, kernel_memsz, kernel_align;
+ char *kernel_buf;
+ struct bzimage64_data *ldata;
+ struct kexec_entry64_regs regs64;
+ void *stack;
+ unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+ unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+
+ header = (struct setup_header *)(kernel + setup_hdr_offset);
+ setup_sects = header->setup_sects;
+ if (setup_sects == 0)
+ setup_sects = 4;
+
+ kern16_size = (setup_sects + 1) * 512;
+ if (kernel_len < kern16_size) {
+ pr_err("bzImage truncated\n");
+ return ERR_PTR(-ENOEXEC);
+ }
+
+ if (cmdline_len > header->cmdline_size) {
+ pr_err("Kernel command line too long\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * In case of crash dump, we will append elfcorehdr=<addr> to
+ * command line. Make sure it does not overflow
+ */
+ if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+ pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Allocate and load backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_load_segments(image);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Load purgatory. For 64bit entry point, purgatory code can be
+ * anywhere.
+ */
+ ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1,
+ &purgatory_load_addr);
+ if (ret) {
+ pr_err("Loading purgatory failed\n");
+ return ERR_PTR(ret);
+ }
+
+ pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr);
+
+
+ /*
+ * Load Bootparams and cmdline and space for efi stuff.
+ *
+ * Allocate memory together for multiple data structures so
+ * that they all can go in single area/segment and we don't
+ * have to create separate segment for each. Keeps things
+ * little bit simple
+ */
+ efi_map_sz = efi_get_runtime_map_size();
+ efi_map_sz = ALIGN(efi_map_sz, 16);
+ params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+ MAX_ELFCOREHDR_STR_LEN;
+ params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+ params_misc_sz = params_cmdline_sz + efi_map_sz +
+ sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data);
+
+ params = kzalloc(params_misc_sz, GFP_KERNEL);
+ if (!params)
+ return ERR_PTR(-ENOMEM);
+ efi_map_offset = params_cmdline_sz;
+ efi_setup_data_offset = efi_map_offset + efi_map_sz;
+
+ /* Copy setup header onto bootparams. Documentation/x86/boot.txt */
+ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
+
+ /* Is there a limit on setup header size? */
+ memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+ ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
+ params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
+ ULONG_MAX, 1, &bootparam_load_addr);
+ if (ret)
+ goto out_free_params;
+ pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, params_misc_sz, params_misc_sz);
+
+ /* Load kernel */
+ kernel_buf = kernel + kern16_size;
+ kernel_bufsz = kernel_len - kern16_size;
+ kernel_memsz = PAGE_ALIGN(header->init_size);
+ kernel_align = header->kernel_alignment;
+
+ ret = kexec_add_buffer(image, kernel_buf,
+ kernel_bufsz, kernel_memsz, kernel_align,
+ MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
+ &kernel_load_addr);
+ if (ret)
+ goto out_free_params;
+
+ pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kernel_memsz, kernel_memsz);
+
+ /* Load initrd high */
+ if (initrd) {
+ ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
+ PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
+ ULONG_MAX, 1, &initrd_load_addr);
+ if (ret)
+ goto out_free_params;
+
+ pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
+
+ setup_initrd(params, initrd_load_addr, initrd_len);
+ }
+
+ setup_cmdline(image, params, bootparam_load_addr,
+ sizeof(struct boot_params), cmdline, cmdline_len);
+
+ /* bootloader info. Do we need a separate ID for kexec kernel loader? */
+ params->hdr.type_of_loader = 0x0D << 4;
+ params->hdr.loadflags = 0;
+
+ /* Setup purgatory regs for entry */
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 1);
+ if (ret)
+ goto out_free_params;
+
+ regs64.rbx = 0; /* Bootstrap Processor */
+ regs64.rsi = bootparam_load_addr;
+ regs64.rip = kernel_load_addr + 0x200;
+ stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+ if (IS_ERR(stack)) {
+ pr_err("Could not find address of symbol stack_end\n");
+ ret = -EINVAL;
+ goto out_free_params;
+ }
+
+ regs64.rsp = (unsigned long)stack;
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 0);
+ if (ret)
+ goto out_free_params;
+
+ ret = setup_boot_parameters(image, params, bootparam_load_addr,
+ efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+ if (ret)
+ goto out_free_params;
+
+ /* Allocate loader specific data */
+ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+ if (!ldata) {
+ ret = -ENOMEM;
+ goto out_free_params;
+ }
+
+ /*
+ * Store pointer to params so that it could be freed after loading
+ * params segment has been loaded and contents have been copied
+ * somewhere else.
+ */
+ ldata->bootparams_buf = params;
+ return ldata;
+
+out_free_params:
+ kfree(params);
+ return ERR_PTR(ret);
+}
+
+/* This cleanup function is called after various segments have been loaded */
+int bzImage64_cleanup(void *loader_data)
+{
+ struct bzimage64_data *ldata = loader_data;
+
+ if (!ldata)
+ return 0;
+
+ kfree(ldata->bootparams_buf);
+ ldata->bootparams_buf = NULL;
+
+ return 0;
+}
+
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
+{
+ bool trusted;
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ system_trusted_keyring, &trusted);
+ if (ret < 0)
+ return ret;
+ if (!trusted)
+ return -EKEYREJECTED;
+ return 0;
+}
+#endif
+
+struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = bzImage64_verify_sig,
+#endif
+};
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index f304773285ae..f1314d0bcf0a 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -338,8 +338,10 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
* a relative jump.
*/
rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
- if (abs(rel) > 0x7fffffff)
+ if (abs(rel) > 0x7fffffff) {
+ __arch_remove_optimized_kprobe(op, 0);
return -ERANGE;
+ }
buf = (u8 *)op->optinsn.insn;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 679cef0791cd..485981059a40 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
@@ -21,6 +23,13 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+
+#ifdef CONFIG_KEXEC_FILE
+static struct kexec_file_ops *kexec_file_loaders[] = {
+ &kexec_bzImage64_ops,
+};
+#endif
static void free_transition_pgtable(struct kimage *image)
{
@@ -171,6 +180,45 @@ static void load_segments(void)
);
}
+#ifdef CONFIG_KEXEC_FILE
+/* Update purgatory as needed after various image segments have been prepared */
+static int arch_update_purgatory(struct kimage *image)
+{
+ int ret = 0;
+
+ if (!image->file_mode)
+ return 0;
+
+ /* Setup copying of backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
+ &image->arch.backup_load_addr,
+ sizeof(image->arch.backup_load_addr), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_src",
+ &image->arch.backup_src_start,
+ sizeof(image->arch.backup_src_start), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
+ &image->arch.backup_src_sz,
+ sizeof(image->arch.backup_src_sz), 0);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+#else /* !CONFIG_KEXEC_FILE */
+static inline int arch_update_purgatory(struct kimage *image)
+{
+ return 0;
+}
+#endif /* CONFIG_KEXEC_FILE */
+
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
@@ -184,6 +232,11 @@ int machine_kexec_prepare(struct kimage *image)
if (result)
return result;
+ /* update purgatory as needed */
+ result = arch_update_purgatory(image);
+ if (result)
+ return result;
+
return 0;
}
@@ -283,3 +336,200 @@ void arch_crash_save_vmcoreinfo(void)
(unsigned long)&_text - __START_KERNEL);
}
+/* arch-dependent functionality related to kexec file-based syscall */
+
+#ifdef CONFIG_KEXEC_FILE
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ int i, ret = -ENOEXEC;
+ struct kexec_file_ops *fops;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
+ fops = kexec_file_loaders[i];
+ if (!fops || !fops->probe)
+ continue;
+
+ ret = fops->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+ vfree(image->arch.elf_headers);
+ image->arch.elf_headers = NULL;
+
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
+}
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
+}
+
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
+ unsigned long kernel_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(kernel, kernel_len);
+}
+
+/*
+ * Apply purgatory relocations.
+ *
+ * ehdr: Pointer to elf headers
+ * sechdrs: Pointer to section headers.
+ * relsec: section index of SHT_RELA section.
+ *
+ * TODO: Some of the code belongs to generic code. Move that in kexec.c.
+ */
+int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
+ Elf64_Shdr *sechdrs, unsigned int relsec)
+{
+ unsigned int i;
+ Elf64_Rela *rel;
+ Elf64_Sym *sym;
+ void *location;
+ Elf64_Shdr *section, *symtabsec;
+ unsigned long address, sec_base, value;
+ const char *strtab, *name, *shstrtab;
+
+ /*
+ * ->sh_offset has been modified to keep the pointer to section
+ * contents in memory
+ */
+ rel = (void *)sechdrs[relsec].sh_offset;
+
+ /* Section to which relocations apply */
+ section = &sechdrs[sechdrs[relsec].sh_info];
+
+ pr_debug("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+
+ /* Associated symbol table */
+ symtabsec = &sechdrs[sechdrs[relsec].sh_link];
+
+ /* String table */
+ if (symtabsec->sh_link >= ehdr->e_shnum) {
+ /* Invalid strtab section number */
+ pr_err("Invalid string table section index %d\n",
+ symtabsec->sh_link);
+ return -ENOEXEC;
+ }
+
+ strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset;
+
+ /* section header string table */
+ shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset;
+
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+
+ /*
+ * rel[i].r_offset contains byte offset from beginning
+ * of section to the storage unit affected.
+ *
+ * This is location to update (->sh_offset). This is temporary
+ * buffer where section is currently loaded. This will finally
+ * be loaded to a different address later, pointed to by
+ * ->sh_addr. kexec takes care of moving it
+ * (kexec_load_segment()).
+ */
+ location = (void *)(section->sh_offset + rel[i].r_offset);
+
+ /* Final address of the location */
+ address = section->sh_addr + rel[i].r_offset;
+
+ /*
+ * rel[i].r_info contains information about symbol table index
+ * w.r.t which relocation must be made and type of relocation
+ * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
+ * these respectively.
+ */
+ sym = (Elf64_Sym *)symtabsec->sh_offset +
+ ELF64_R_SYM(rel[i].r_info);
+
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_ABS)
+ sec_base = 0;
+ else if (sym->st_shndx >= ehdr->e_shnum) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return -ENOEXEC;
+ } else
+ sec_base = sechdrs[sym->st_shndx].sh_addr;
+
+ value = sym->st_value;
+ value += sec_base;
+ value += rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ *(u64 *)location = value;
+ break;
+ case R_X86_64_32:
+ *(u32 *)location = value;
+ if (value != *(u32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ *(s32 *)location = value;
+ if ((s64)value != *(s32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ value -= (u64)address;
+ *(u32 *)location = value;
+ break;
+ default:
+ pr_err("Unknown rela relocation: %llu\n",
+ ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+overflow:
+ pr_err("Overflow in relocation type %d value 0x%lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), value);
+ return -ENOEXEC;
+}
+#endif /* CONFIG_KEXEC_FILE */
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index c050a0153168..c73aecf10d34 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -46,10 +46,6 @@ END(function_hook)
.endm
ENTRY(ftrace_caller)
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
ftrace_caller_setup
/* regs go into 4th parameter (but make it NULL) */
movq $0, %rcx
@@ -73,10 +69,6 @@ ENTRY(ftrace_regs_caller)
/* Save the current flags before compare (in SS location)*/
pushfq
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
-
/* skip=8 to skip flags saved in SS */
ftrace_caller_setup 8
@@ -131,7 +123,7 @@ GLOBAL(ftrace_regs_call)
popfq
jmp ftrace_return
-ftrace_restore_flags:
+
popfq
jmp ftrace_stub
@@ -141,9 +133,6 @@ END(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(function_hook)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d2b56489d70f..2d2a237f2c73 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/pci.h>
+#include <linux/irqdomain.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
@@ -67,7 +68,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
boot_cpu_physical_apicid = m->apicid;
}
- printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
+ pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
generic_processor_info(apicid, m->apicver);
}
@@ -87,9 +88,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
#if MAX_MP_BUSSES < 256
if (m->busid >= MAX_MP_BUSSES) {
- printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
- " is too large, max. supported is %d\n",
- m->busid, str, MAX_MP_BUSSES - 1);
+ pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n",
+ m->busid, str, MAX_MP_BUSSES - 1);
return;
}
#endif
@@ -110,19 +110,29 @@ static void __init MP_bus_info(struct mpc_bus *m)
mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
#endif
} else
- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+ pr_warn("Unknown bustype %s - ignoring\n", str);
}
+static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+ .map = mp_irqdomain_map,
+ .unmap = mp_irqdomain_unmap,
+};
+
static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_LEGACY,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+
if (m->flags & MPC_APIC_USABLE)
- mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
+ mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg);
}
static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
{
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ apic_printk(APIC_VERBOSE,
+ "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
mp_irq->irqtype, mp_irq->irqflag & 3,
(mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
@@ -135,8 +145,8 @@ static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
- apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ apic_printk(APIC_VERBOSE,
+ "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n",
m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
m->srcbusirq, m->destapic, m->destapiclint);
}
@@ -148,34 +158,33 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
{
if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
- printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+ pr_err("MPTABLE: bad signature [%c%c%c%c]!\n",
mpc->signature[0], mpc->signature[1],
mpc->signature[2], mpc->signature[3]);
return 0;
}
if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
- printk(KERN_ERR "MPTABLE: checksum error!\n");
+ pr_err("MPTABLE: checksum error!\n");
return 0;
}
if (mpc->spec != 0x01 && mpc->spec != 0x04) {
- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
- mpc->spec);
+ pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec);
return 0;
}
if (!mpc->lapic) {
- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+ pr_err("MPTABLE: null local APIC address!\n");
return 0;
}
memcpy(oem, mpc->oem, 8);
oem[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
+ pr_info("MPTABLE: OEM ID: %s\n", oem);
memcpy(str, mpc->productid, 12);
str[12] = 0;
- printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
+ pr_info("MPTABLE: Product ID: %s\n", str);
- printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+ pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic);
return 1;
}
@@ -188,8 +197,8 @@ static void skip_entry(unsigned char **ptr, int *count, int size)
static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
{
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
- "type %x\n", *mpt);
+ pr_err("Your mptable is wrong, contact your HW vendor!\n");
+ pr_cont("type %x\n", *mpt);
print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
1, mpc, mpc->length, 1);
}
@@ -207,9 +216,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (!smp_check_mpc(mpc, oem, str))
return 0;
-#ifdef CONFIG_X86_32
- generic_mps_oem_check(mpc, oem, str);
-#endif
/* Initialize the lapic mapping */
if (!acpi_lapic)
register_lapic_address(mpc->lapic);
@@ -259,7 +265,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
}
if (!num_processors)
- printk(KERN_ERR "MPTABLE: no processors registered!\n");
+ pr_err("MPTABLE: no processors registered!\n");
return num_processors;
}
@@ -295,16 +301,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* If it does, we assume it's valid.
*/
if (mpc_default_type == 5) {
- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
- "falling back to ELCR\n");
+ pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
ELCR_trigger(13))
- printk(KERN_ERR "ELCR contains invalid data... "
- "not using ELCR\n");
+ pr_err("ELCR contains invalid data... not using ELCR\n");
else {
- printk(KERN_INFO
- "Using ELCR to identify PCI interrupts\n");
+ pr_info("Using ELCR to identify PCI interrupts\n");
ELCR_fallback = 1;
}
}
@@ -353,7 +356,7 @@ static void __init construct_ioapic_table(int mpc_default_type)
bus.busid = 0;
switch (mpc_default_type) {
default:
- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+ pr_err("???\nUnknown standard configuration %d\n",
mpc_default_type);
/* fall through */
case 1:
@@ -462,8 +465,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
#endif
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
- "... disabling SMP support. (tell your hw vendor)\n");
+ pr_err("BIOS bug, MP table errors detected!...\n");
+ pr_cont("... disabling SMP support. (tell your hw vendor)\n");
early_iounmap(mpc, size);
return -1;
}
@@ -481,8 +484,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
if (!mp_irq_entries) {
struct mpc_bus bus;
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
- "using default mptable. (tell your hw vendor)\n");
+ pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
bus.type = MP_BUS;
bus.busid = 0;
@@ -516,14 +518,14 @@ void __init default_get_smp_config(unsigned int early)
if (acpi_lapic && acpi_ioapic)
return;
- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
- mpf->specification);
+ pr_info("Intel MultiProcessor Specification v1.%d\n",
+ mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
if (mpf->feature2 & (1 << 7)) {
- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
+ pr_info(" IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
+ pr_info(" Virtual Wire compatibility mode.\n");
pic_mode = 0;
}
#endif
@@ -539,8 +541,7 @@ void __init default_get_smp_config(unsigned int early)
return;
}
- printk(KERN_INFO "Default MP configuration #%d\n",
- mpf->feature1);
+ pr_info("Default MP configuration #%d\n", mpf->feature1);
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
@@ -550,7 +551,7 @@ void __init default_get_smp_config(unsigned int early)
BUG();
if (!early)
- printk(KERN_INFO "Processors: %d\n", num_processors);
+ pr_info("Processors: %d\n", num_processors);
/*
* Only use the first configuration found.
*/
@@ -583,10 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
#endif
mpf_found = mpf;
- printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
- (unsigned long long) virt_to_phys(mpf),
- (unsigned long long) virt_to_phys(mpf) +
- sizeof(*mpf) - 1, mpf);
+ pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+ (unsigned long long) virt_to_phys(mpf),
+ (unsigned long long) virt_to_phys(mpf) +
+ sizeof(*mpf) - 1, mpf);
mem = virt_to_phys(mpf);
memblock_reserve(mem, sizeof(*mpf));
@@ -735,7 +736,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
int nr_m_spare = 0;
unsigned char *mpt = ((unsigned char *)mpc) + count;
- printk(KERN_INFO "mpc_length %x\n", mpc->length);
+ pr_info("mpc_length %x\n", mpc->length);
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
@@ -862,13 +863,13 @@ static int __init update_mp_table(void)
if (!smp_check_mpc(mpc, oem, str))
return 0;
- printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
- printk(KERN_INFO "physptr: %x\n", mpf->physptr);
+ pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+ pr_info("physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
- printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
- mpc_new_length);
+ pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
}
if (!mpc_new_phys) {
@@ -879,10 +880,10 @@ static int __init update_mp_table(void)
mpc->checksum = 0xff;
new = mpf_checksum((unsigned char *)mpc, mpc->length);
if (old == new) {
- printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+ pr_info("mpc is readonly, please try alloc_mptable instead\n");
return 0;
}
- printk(KERN_INFO "use in-position replacing\n");
+ pr_info("use in-position replacing\n");
} else {
mpf->physptr = mpc_new_phys;
mpc_new = phys_to_virt(mpc_new_phys);
@@ -892,7 +893,7 @@ static int __init update_mp_table(void)
if (mpc_new_phys - mpf->physptr) {
struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
- printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+ pr_info("mpf new: %x\n", 0x400 - 16);
mpf_new = phys_to_virt(0x400 - 16);
memcpy(mpf_new, mpf, 16);
mpf = mpf_new;
@@ -900,7 +901,7 @@ static int __init update_mp_table(void)
}
mpf->checksum = 0;
mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
- printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
+ pr_info("physptr new: %x\n", mpf->physptr);
}
/*
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
new file mode 100644
index 000000000000..0c424a67985d
--- /dev/null
+++ b/arch/x86/kernel/pmc_atom.c
@@ -0,0 +1,321 @@
+/*
+ * Intel Atom SOC Power Management Controller Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+
+#include <asm/pmc_atom.h>
+
+#define DRIVER_NAME KBUILD_MODNAME
+
+struct pmc_dev {
+ u32 base_addr;
+ void __iomem *regmap;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *dbgfs_dir;
+#endif /* CONFIG_DEBUG_FS */
+};
+
+static struct pmc_dev pmc_device;
+static u32 acpi_base_addr;
+
+struct pmc_dev_map {
+ const char *name;
+ u32 bit_mask;
+};
+
+static const struct pmc_dev_map dev_map[] = {
+ {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
+ {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
+ {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
+ {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1},
+ {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2},
+ {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI},
+ {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX},
+ {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX},
+ {"8 - SCC_EMMC", BIT_SCC_EMMC},
+ {"9 - SCC_SDIO", BIT_SCC_SDIO},
+ {"10 - SCC_SDCARD", BIT_SCC_SDCARD},
+ {"11 - SCC_MIPI", BIT_SCC_MIPI},
+ {"12 - HDA", BIT_HDA},
+ {"13 - LPE", BIT_LPE},
+ {"14 - OTG", BIT_OTG},
+ {"15 - USH", BIT_USH},
+ {"16 - GBE", BIT_GBE},
+ {"17 - SATA", BIT_SATA},
+ {"18 - USB_EHCI", BIT_USB_EHCI},
+ {"19 - SEC", BIT_SEC},
+ {"20 - PCIE_PORT0", BIT_PCIE_PORT0},
+ {"21 - PCIE_PORT1", BIT_PCIE_PORT1},
+ {"22 - PCIE_PORT2", BIT_PCIE_PORT2},
+ {"23 - PCIE_PORT3", BIT_PCIE_PORT3},
+ {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA},
+ {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1},
+ {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2},
+ {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3},
+ {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4},
+ {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5},
+ {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6},
+ {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7},
+ {"32 - SMB", BIT_SMB},
+ {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY},
+ {"34 - USH_SS_PHY", BIT_USH_SS_PHY},
+ {"35 - DFX", BIT_DFX},
+};
+
+static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
+{
+ return readl(pmc->regmap + reg_offset);
+}
+
+static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
+{
+ writel(val, pmc->regmap + reg_offset);
+}
+
+static void pmc_power_off(void)
+{
+ u16 pm1_cnt_port;
+ u32 pm1_cnt_value;
+
+ pr_info("Preparing to enter system sleep state S5\n");
+
+ pm1_cnt_port = acpi_base_addr + PM1_CNT;
+
+ pm1_cnt_value = inl(pm1_cnt_port);
+ pm1_cnt_value &= SLEEP_TYPE_MASK;
+ pm1_cnt_value |= SLEEP_TYPE_S5;
+ pm1_cnt_value |= SLEEP_ENABLE;
+
+ outl(pm1_cnt_value, pm1_cnt_port);
+}
+
+static void pmc_hw_reg_setup(struct pmc_dev *pmc)
+{
+ /*
+ * Disable PMC S0IX_WAKE_EN events coming from:
+ * - LPC clock run
+ * - GPIO_SUS ored dedicated IRQs
+ * - GPIO_SCORE ored dedicated IRQs
+ * - GPIO_SUS shared IRQ
+ * - GPIO_SCORE shared IRQ
+ */
+ pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int pmc_dev_state_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u32 func_dis, func_dis_2, func_dis_index;
+ u32 d3_sts_0, d3_sts_1, d3_sts_index;
+ int dev_num, dev_index, reg_index;
+
+ func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
+ func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
+ d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
+ d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
+
+ dev_num = ARRAY_SIZE(dev_map);
+
+ for (dev_index = 0; dev_index < dev_num; dev_index++) {
+ reg_index = dev_index / PMC_REG_BIT_WIDTH;
+ if (reg_index) {
+ func_dis_index = func_dis_2;
+ d3_sts_index = d3_sts_1;
+ } else {
+ func_dis_index = func_dis;
+ d3_sts_index = d3_sts_0;
+ }
+
+ seq_printf(s, "Dev: %-32s\tState: %s [%s]\n",
+ dev_map[dev_index].name,
+ dev_map[dev_index].bit_mask & func_dis_index ?
+ "Disabled" : "Enabled ",
+ dev_map[dev_index].bit_mask & d3_sts_index ?
+ "D3" : "D0");
+ }
+ return 0;
+}
+
+static int pmc_dev_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_dev_state_ops = {
+ .open = pmc_dev_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
+
+ s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
+ s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
+ s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
+ s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
+ s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
+
+ seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
+ seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
+ seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
+ seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
+ seq_printf(s, "S0 Residency:\t%lldus\n", s0_tmr);
+ return 0;
+}
+
+static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_sleep_tmr_show, inode->i_private);
+}
+
+static const struct file_operations pmc_sleep_tmr_ops = {
+ .open = pmc_sleep_tmr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
+{
+ if (!pmc->dbgfs_dir)
+ return;
+
+ debugfs_remove_recursive(pmc->dbgfs_dir);
+ pmc->dbgfs_dir = NULL;
+}
+
+static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+{
+ struct dentry *dir, *f;
+
+ dir = debugfs_create_dir("pmc_atom", NULL);
+ if (!dir)
+ return -ENOMEM;
+
+ f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_dev_state_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "dev_states register failed\n");
+ goto err;
+ }
+ f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_sleep_tmr_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "sleep_state register failed\n");
+ goto err;
+ }
+ pmc->dbgfs_dir = dir;
+ return 0;
+err:
+ pmc_dbgfs_unregister(pmc);
+ return -ENODEV;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static int pmc_setup_dev(struct pci_dev *pdev)
+{
+ struct pmc_dev *pmc = &pmc_device;
+ int ret;
+
+ /* Obtain ACPI base address */
+ pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
+ acpi_base_addr &= ACPI_BASE_ADDR_MASK;
+
+ /* Install power off function */
+ if (acpi_base_addr != 0 && pm_power_off == NULL)
+ pm_power_off = pmc_power_off;
+
+ pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
+ pmc->base_addr &= PMC_BASE_ADDR_MASK;
+
+ pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
+ if (!pmc->regmap) {
+ dev_err(&pdev->dev, "error: ioremap failed\n");
+ return -ENOMEM;
+ }
+
+ /* PMC hardware registers setup */
+ pmc_hw_reg_setup(pmc);
+
+#ifdef CONFIG_DEBUG_FS
+ ret = pmc_dbgfs_register(pmc, pdev);
+ if (ret) {
+ iounmap(pmc->regmap);
+ return ret;
+ }
+#endif /* CONFIG_DEBUG_FS */
+ return 0;
+}
+
+/*
+ * Data for PCI driver interface
+ *
+ * This data only exists for exporting the supported
+ * PCI ids via MODULE_DEVICE_TABLE. We do not actually
+ * register a pci_driver, because lpc_ich will register
+ * a driver on the same PCI id.
+ */
+static const struct pci_device_id pmc_pci_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) },
+ { 0, },
+};
+
+MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
+
+static int __init pmc_atom_init(void)
+{
+ int err = -ENODEV;
+ struct pci_dev *pdev = NULL;
+ const struct pci_device_id *ent;
+
+ /* We look for our device - PCU PMC
+ * we assume that there is max. one device.
+ *
+ * We can't use plain pci_driver mechanism,
+ * as the device is really a multiple function device,
+ * main driver that binds to the pci_device is lpc_ich
+ * and have to find & bind to the device this way.
+ */
+ for_each_pci_dev(pdev) {
+ ent = pci_match_id(pmc_pci_ids, pdev);
+ if (ent) {
+ err = pmc_setup_dev(pdev);
+ goto out;
+ }
+ }
+ /* Device not found. */
+out:
+ return err;
+}
+
+module_init(pmc_atom_init);
+/* no module_exit, this driver shouldn't be unloaded */
+
+MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
+MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4505e2a950d8..f804dc935d2a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -93,6 +93,7 @@ void arch_task_cache_init(void)
kmem_cache_create("task_xstate", xstate_size,
__alignof__(union thread_xstate),
SLAB_PANIC | SLAB_NOTRACK, NULL);
+ setup_xstate_comp();
}
/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 52b1157c53eb..17962e667a91 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -28,6 +28,7 @@
#include <linux/mc146818rtc.h>
#include <asm/realmode.h>
#include <asm/x86_init.h>
+#include <asm/efi.h>
/*
* Power off function, if any
@@ -401,12 +402,25 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
static int __init reboot_init(void)
{
+ int rv;
+
/*
* Only do the DMI check if reboot_type hasn't been overridden
* on the command line
*/
- if (reboot_default)
- dmi_check_system(reboot_dmi_table);
+ if (!reboot_default)
+ return 0;
+
+ /*
+ * The DMI quirks table takes precedence. If no quirks entry
+ * matches and the ACPI Hardware Reduced bit is set, force EFI
+ * reboot.
+ */
+ rv = dmi_check_system(reboot_dmi_table);
+
+ if (!rv && efi_reboot_required())
+ reboot_type = BOOT_EFI;
+
return 0;
}
core_initcall(reboot_init);
@@ -528,11 +542,7 @@ static void native_machine_emergency_restart(void)
break;
case BOOT_EFI:
- if (efi_enabled(EFI_RUNTIME_SERVICES))
- efi.reset_system(reboot_mode == REBOOT_WARM ?
- EFI_RESET_WARM :
- EFI_RESET_COLD,
- EFI_SUCCESS, 0, NULL);
+ efi_reboot(reboot_mode, NULL);
reboot_type = BOOT_BIOS;
break;
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 2a26819bb6a8..80eab01c1a68 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail)
void arch_remove_reservations(struct resource *avail)
{
- /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
if (avail->flags & IORESOURCE_MEM) {
- if (avail->start < BIOS_END)
- avail->start = BIOS_END;
resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
remove_e820_regions(avail);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 78a0e6298922..41ead8d3bc0b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -924,10 +924,10 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL32", 4)) {
+ EFI32_LOADER_SIGNATURE, 4)) {
set_bit(EFI_BOOT, &efi.flags);
} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL64", 4)) {
+ EFI64_LOADER_SIGNATURE, 4)) {
set_bit(EFI_BOOT, &efi.flags);
set_bit(EFI_64BIT, &efi.flags);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5492798930ef..42a2dca984b3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -168,10 +168,6 @@ static void smp_callin(void)
* CPU, first the APIC. (this is probably redundant on most
* boards)
*/
-
- pr_debug("CALLIN, before setup_local_APIC()\n");
- if (apic->smp_callin_clear_local_apic)
- apic->smp_callin_clear_local_apic();
setup_local_APIC();
end_local_APIC_setup();
@@ -1143,10 +1139,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
enable_IO_APIC();
bsp_end_local_APIC_setup();
-
- if (apic->setup_portio_remap)
- apic->setup_portio_remap();
-
smpboot_setup_io_apic();
/*
* Set up local APIC timer on boot CPU.
@@ -1292,6 +1284,9 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, cpu_sibling_mask(cpu))
cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(cpu_sibling_mask(cpu));
cpumask_clear(cpu_core_mask(cpu));
c->phys_proc_id = 0;
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index bf7ef5ce29df..0fa29609b2c4 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -68,6 +68,8 @@ static struct irqaction irq0 = {
void __init setup_default_timer_irq(void)
{
+ if (!nr_legacy_irqs())
+ return;
setup_irq(0, &irq0);
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ea030319b321..b6025f9e36c6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -234,9 +234,6 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
return ns;
}
-/* XXX surely we already have this someplace in the kernel?! */
-#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
-
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
unsigned long long tsc_now, ns_now;
@@ -259,7 +256,9 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+ data->cyc2ns_mul =
+ DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
+ cpu_khz);
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
data->cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
@@ -951,7 +950,7 @@ core_initcall(cpufreq_tsc);
static struct clocksource clocksource_tsc;
/*
- * We compare the TSC to the cycle_last value in the clocksource
+ * We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
* very small window right after one CPU updated cycle_last under
* xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
@@ -961,26 +960,23 @@ static struct clocksource clocksource_tsc;
* due to the unsigned delta calculation of the time keeping core
* code, which is necessary to support wrapping clocksources like pm
* timer.
+ *
+ * This sanity check is now done in the core timekeeping code.
+ * checking the result of read_tsc() - cycle_last for being negative.
+ * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
*/
static cycle_t read_tsc(struct clocksource *cs)
{
- cycle_t ret = (cycle_t)get_cycles();
-
- return ret >= clocksource_tsc.cycle_last ?
- ret : clocksource_tsc.cycle_last;
-}
-
-static void resume_tsc(struct clocksource *cs)
-{
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
- clocksource_tsc.cycle_last = 0;
+ return (cycle_t)get_cycles();
}
+/*
+ * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
+ */
static struct clocksource clocksource_tsc = {
.name = "tsc",
.rating = 300,
.read = read_tsc,
- .resume = resume_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index b99b9ad8540c..ee22c1d93ae5 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -152,7 +152,7 @@ static void __init detect_vsmp_box(void)
is_vsmp = 1;
}
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
{
if (is_vsmp != -1)
return is_vsmp;
@@ -166,7 +166,7 @@ int is_vsmp_box(void)
static void __init detect_vsmp_box(void)
{
}
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
{
return 0;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ea5b5709aa76..e1e1e80fc6a6 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -81,10 +81,10 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
if (!show_unhandled_signals)
return;
- pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
- level, current->comm, task_pid_nr(current),
- message, regs->ip, regs->cs,
- regs->sp, regs->ax, regs->si, regs->di);
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
index 9531fbb123ba..c7d791f32b98 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -31,29 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
gtod_write_begin(vdata);
/* copy vsyscall data */
- vdata->vclock_mode = tk->clock->archdata.vclock_mode;
- vdata->cycle_last = tk->clock->cycle_last;
- vdata->mask = tk->clock->mask;
- vdata->mult = tk->mult;
- vdata->shift = tk->shift;
+ vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode;
+ vdata->cycle_last = tk->tkr.cycle_last;
+ vdata->mask = tk->tkr.mask;
+ vdata->mult = tk->tkr.mult;
+ vdata->shift = tk->tkr.shift;
vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->xtime_nsec;
+ vdata->wall_time_snsec = tk->tkr.xtime_nsec;
vdata->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->xtime_nsec
+ vdata->monotonic_time_snsec = tk->tkr.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->shift);
+ << tk->tkr.shift);
while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->shift)) {
+ (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->shift;
+ ((u64)NSEC_PER_SEC) << tk->tkr.shift;
vdata->monotonic_time_sec++;
}
vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift);
+ vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
+ tk->tkr.shift);
vdata->monotonic_time_coarse_sec =
vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c6addf..940b142cc11f 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -8,6 +8,7 @@
#include <linux/bootmem.h>
#include <linux/compat.h>
+#include <linux/cpu.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
@@ -24,7 +25,9 @@ u64 pcntxt_mask;
struct xsave_struct *init_xstate_buf;
static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+static unsigned int *xstate_offsets, *xstate_sizes;
+static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
+static unsigned int xstate_features;
/*
* If a processor implementation discern that a processor state component is
@@ -283,7 +286,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
if (use_xsave()) {
/* These bits must be zero. */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+ memset(xsave_hdr->reserved, 0, 48);
/*
* Init the state that is not present in the memory
@@ -479,6 +482,52 @@ static void __init setup_xstate_features(void)
}
/*
+ * This function sets up offsets and sizes of all extended states in
+ * xsave area. This supports both standard format and compacted format
+ * of the xsave aread.
+ *
+ * Input: void
+ * Output: void
+ */
+void setup_xstate_comp(void)
+{
+ unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8];
+ int i;
+
+ /*
+ * The FP xstates and SSE xstates are legacy states. They are always
+ * in the fixed offsets in the xsave area in either compacted form
+ * or standard form.
+ */
+ xstate_comp_offsets[0] = 0;
+ xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
+
+ if (!cpu_has_xsaves) {
+ for (i = 2; i < xstate_features; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+ xstate_comp_offsets[i] = xstate_offsets[i];
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ }
+ }
+ return;
+ }
+
+ xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+ for (i = 2; i < xstate_features; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask))
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ else
+ xstate_comp_sizes[i] = 0;
+
+ if (i > 2)
+ xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+ + xstate_comp_sizes[i-1];
+
+ }
+}
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
@@ -496,15 +545,21 @@ static void __init setup_init_fpu_buf(void)
setup_xstate_features();
+ if (cpu_has_xsaves) {
+ init_xstate_buf->xsave_hdr.xcomp_bv =
+ (u64)1 << 63 | pcntxt_mask;
+ init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
+ }
+
/*
* Init all the features state with header_bv being 0x0
*/
- xrstor_state(init_xstate_buf, -1);
+ xrstor_state_booting(init_xstate_buf, -1);
/*
* Dump the init state again. This is to identify the init state
* of any feature which is not represented by all zero's.
*/
- xsave_state(init_xstate_buf, -1);
+ xsave_state_booting(init_xstate_buf, -1);
}
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
@@ -520,6 +575,30 @@ static int __init eager_fpu_setup(char *s)
}
__setup("eagerfpu=", eager_fpu_setup);
+
+/*
+ * Calculate total size of enabled xstates in XCR0/pcntxt_mask.
+ */
+static void __init init_xstate_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ int i;
+
+ if (!cpu_has_xsaves) {
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ xstate_size = ebx;
+ return;
+ }
+
+ xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ for (i = 2; i < 64; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ xstate_size += eax;
+ }
+ }
+}
+
/*
* Enable and initialize the xsave feature.
*/
@@ -551,8 +630,7 @@ static void __init xstate_enable_boot_cpu(void)
/*
* Recompute the context size for enabled features
*/
- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xstate_size = ebx;
+ init_xstate_size();
update_regset_xstate_info(xstate_size, pcntxt_mask);
prepare_fx_sw_frame();
@@ -572,8 +650,9 @@ static void __init xstate_enable_boot_cpu(void)
}
}
- pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
- pcntxt_mask, xstate_size);
+ pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
+ pcntxt_mask, xstate_size,
+ cpu_has_xsaves ? "compacted form" : "standard form");
}
/*
@@ -635,3 +714,26 @@ void eager_fpu_init(void)
else
fxrstor_checking(&init_xstate_buf->i387);
}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Inputs:
+ * xsave: base address of the xsave area;
+ * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
+ * etc.)
+ * Output:
+ * address of the state in the xsave area.
+ */
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
+{
+ int feature = fls64(xstate) - 1;
+ if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
+ return NULL;
+
+ return (void *)xsave + xstate_comp_offsets[feature];
+}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 287e4c85fff9..f9d16ff56c6b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
select MMU_NOTIFIER
select ANON_INODES
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index f9087315e0cd..a5380590ab0e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -95,4 +95,12 @@ static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
return best && (best->edx & bit(X86_FEATURE_GBPAGES));
}
+
+static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_RTM));
+}
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e4e833d3d7d7..03954f7900f5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -162,6 +162,10 @@
#define NoWrite ((u64)1 << 45) /* No writeback */
#define SrcWrite ((u64)1 << 46) /* Write back src operand */
#define NoMod ((u64)1 << 47) /* Mod field is ignored */
+#define Intercept ((u64)1 << 48) /* Has valid intercept field */
+#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
+#define NoBigReal ((u64)1 << 50) /* No big real mode */
+#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -426,6 +430,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
.modrm_reg = ctxt->modrm_reg,
.modrm_rm = ctxt->modrm_rm,
.src_val = ctxt->src.val64,
+ .dst_val = ctxt->dst.val64,
.src_bytes = ctxt->src.bytes,
.dst_bytes = ctxt->dst.bytes,
.ad_bytes = ctxt->ad_bytes,
@@ -511,12 +516,6 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
return desc->g ? (limit << 12) | 0xfff : limit;
}
-static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
-{
- ctxt->has_seg_override = true;
- ctxt->seg_override = seg;
-}
-
static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
{
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
@@ -525,14 +524,6 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
return ctxt->ops->get_cached_segment_base(ctxt, seg);
}
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
-{
- if (!ctxt->has_seg_override)
- return 0;
-
- return ctxt->seg_override;
-}
-
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
u32 error, bool valid)
{
@@ -651,7 +642,12 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (!fetch && (desc.type & 8) && !(desc.type & 2))
goto bad;
lim = desc_limit_scaled(&desc);
- if ((desc.type & 8) || !(desc.type & 4)) {
+ if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch &&
+ (ctxt->d & NoBigReal)) {
+ /* la is between zero and 0xffff */
+ if (la > 0xffff || (u32)(la + size - 1) > 0xffff)
+ goto bad;
+ } else if ((desc.type & 8) || !(desc.type & 4)) {
/* expand-up segment */
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad;
@@ -716,68 +712,71 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
}
/*
- * Fetch the next byte of the instruction being emulated which is pointed to
- * by ctxt->_eip, then increment ctxt->_eip.
- *
- * Also prefetch the remaining bytes of the instruction without crossing page
+ * Prefetch the remaining bytes of the instruction without crossing page
* boundary if they are not in fetch_cache yet.
*/
-static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
{
- struct fetch_cache *fc = &ctxt->fetch;
int rc;
- int size, cur_size;
-
- if (ctxt->_eip == fc->end) {
- unsigned long linear;
- struct segmented_address addr = { .seg = VCPU_SREG_CS,
- .ea = ctxt->_eip };
- cur_size = fc->end - fc->start;
- size = min(15UL - cur_size,
- PAGE_SIZE - offset_in_page(ctxt->_eip));
- rc = __linearize(ctxt, addr, size, false, true, &linear);
- if (unlikely(rc != X86EMUL_CONTINUE))
- return rc;
- rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
- size, &ctxt->exception);
- if (unlikely(rc != X86EMUL_CONTINUE))
- return rc;
- fc->end += size;
- }
- *dest = fc->data[ctxt->_eip - fc->start];
- ctxt->_eip++;
- return X86EMUL_CONTINUE;
-}
+ unsigned size;
+ unsigned long linear;
+ int cur_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->eip + cur_size };
+
+ size = 15UL ^ cur_size;
+ rc = __linearize(ctxt, addr, size, false, true, &linear);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
-static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- void *dest, unsigned size)
-{
- int rc;
+ size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
- /* x86 instructions are limited to 15 bytes. */
- if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
+ /*
+ * One instruction can only straddle two pages,
+ * and one has been loaded at the beginning of
+ * x86_decode_insn. So, if not enough bytes
+ * still, we must have hit the 15-byte boundary.
+ */
+ if (unlikely(size < op_size))
return X86EMUL_UNHANDLEABLE;
- while (size--) {
- rc = do_insn_fetch_byte(ctxt, dest++);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- }
+ rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
+ size, &ctxt->exception);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+ ctxt->fetch.end += size;
return X86EMUL_CONTINUE;
}
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+ unsigned size)
+{
+ if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size))
+ return __do_insn_fetch_bytes(ctxt, size);
+ else
+ return X86EMUL_CONTINUE;
+}
+
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _ctxt) \
-({ unsigned long _x; \
- rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \
+({ _type _x; \
+ \
+ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
- (_type)_x; \
+ ctxt->_eip += sizeof(_type); \
+ _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \
+ ctxt->fetch.ptr += sizeof(_type); \
+ _x; \
})
#define insn_fetch_arr(_arr, _size, _ctxt) \
-({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \
+({ \
+ rc = do_insn_fetch_bytes(_ctxt, _size); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
+ ctxt->_eip += (_size); \
+ memcpy(_arr, ctxt->fetch.ptr, _size); \
+ ctxt->fetch.ptr += (_size); \
})
/*
@@ -1063,19 +1062,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
u8 sib;
- int index_reg = 0, base_reg = 0, scale;
+ int index_reg, base_reg, scale;
int rc = X86EMUL_CONTINUE;
ulong modrm_ea = 0;
- if (ctxt->rex_prefix) {
- ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */
- index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
- ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
- }
+ ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
+ index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
+ base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
- ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
- ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+ ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
ctxt->modrm_seg = VCPU_SREG_DS;
if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
@@ -1093,7 +1090,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
if (ctxt->d & Mmx) {
op->type = OP_MM;
op->bytes = 8;
- op->addr.xmm = ctxt->modrm_rm & 7;
+ op->addr.mm = ctxt->modrm_rm & 7;
return rc;
}
fetch_register_operand(op);
@@ -1190,6 +1187,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
}
}
op->addr.mem.ea = modrm_ea;
+ if (ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
done:
return rc;
}
@@ -1220,12 +1220,14 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
long sv = 0, mask;
if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
- mask = ~(ctxt->dst.bytes * 8 - 1);
+ mask = ~((long)ctxt->dst.bytes * 8 - 1);
if (ctxt->src.bytes == 2)
sv = (s16)ctxt->src.val & (s16)mask;
else if (ctxt->src.bytes == 4)
sv = (s32)ctxt->src.val & (s32)mask;
+ else
+ sv = (s64)ctxt->src.val & (s64)mask;
ctxt->dst.addr.mem.ea += (sv >> 3);
}
@@ -1315,8 +1317,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
in_page = (ctxt->eflags & EFLG_DF) ?
offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
- n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
- count);
+ n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
if (n == 0)
n = 1;
rc->pos = rc->end = 0;
@@ -1358,17 +1359,19 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
u16 selector, struct desc_ptr *dt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 base3 = 0;
if (selector & 1 << 2) {
struct desc_struct desc;
u16 sel;
memset (dt, 0, sizeof *dt);
- if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
+ if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+ VCPU_SREG_LDTR))
return;
dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
- dt->address = get_desc_base(&desc);
+ dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
} else
ops->get_gdt(ctxt, dt);
}
@@ -1422,6 +1425,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
ulong desc_addr;
int ret;
u16 dummy;
+ u32 base3 = 0;
memset(&seg_desc, 0, sizeof seg_desc);
@@ -1487,9 +1491,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
break;
case VCPU_SREG_CS:
- if (in_task_switch && rpl != dpl)
- goto exception;
-
if (!(seg_desc.type & 8))
goto exception;
@@ -1538,9 +1539,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
ret = write_segment_descriptor(ctxt, selector, &seg_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
+ } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
+ sizeof(base3), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
}
load:
- ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
return X86EMUL_CONTINUE;
exception:
emulate_exception(ctxt, err_vec, err_code, true);
@@ -1575,34 +1581,28 @@ static void write_register_operand(struct operand *op)
static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
{
- int rc;
-
switch (op->type) {
case OP_REG:
write_register_operand(op);
break;
case OP_MEM:
if (ctxt->lock_prefix)
- rc = segmented_cmpxchg(ctxt,
+ return segmented_cmpxchg(ctxt,
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
+ else
+ return segmented_write(ctxt,
op->addr.mem,
- &op->orig_val,
&op->val,
op->bytes);
- else
- rc = segmented_write(ctxt,
- op->addr.mem,
- &op->val,
- op->bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
break;
case OP_MEM_STR:
- rc = segmented_write(ctxt,
- op->addr.mem,
- op->data,
- op->bytes * op->count);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ return segmented_write(ctxt,
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
break;
case OP_XMM:
write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
@@ -1671,7 +1671,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
- | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+ | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID;
switch(ctxt->mode) {
case X86EMUL_MODE_PROT64:
@@ -1754,6 +1754,9 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
rc = load_segment_descriptor(ctxt, (u16)selector, seg);
return rc;
}
@@ -1991,6 +1994,9 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
{
u64 old = ctxt->dst.orig_val64;
+ if (ctxt->dst.bytes == 16)
+ return X86EMUL_UNHANDLEABLE;
+
if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
@@ -2017,6 +2023,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
unsigned long cs;
+ int cpl = ctxt->ops->cpl(ctxt);
rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2026,6 +2033,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
+ /* Outer-privilege level return is not implemented */
+ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
+ return X86EMUL_UNHANDLEABLE;
rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -2044,8 +2054,10 @@ static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
{
/* Save real source value, then compare EAX against destination. */
+ ctxt->dst.orig_val = ctxt->dst.val;
+ ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
ctxt->src.orig_val = ctxt->src.val;
- ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
fastop(ctxt, em_cmp);
if (ctxt->eflags & EFLG_ZF) {
@@ -2055,6 +2067,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
/* Failure: write the value we saw to EAX. */
ctxt->dst.type = OP_REG;
ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->dst.val = ctxt->dst.orig_val;
}
return X86EMUL_CONTINUE;
}
@@ -2194,7 +2207,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
*reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
if (efer & EFER_LMA) {
#ifdef CONFIG_X86_64
- *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF;
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags;
ops->get_msr(ctxt,
ctxt->mode == X86EMUL_MODE_PROT64 ?
@@ -2202,14 +2215,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->_eip = msr_data;
ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
- ctxt->eflags &= ~(msr_data | EFLG_RF);
+ ctxt->eflags &= ~msr_data;
#endif
} else {
/* legacy mode */
ops->get_msr(ctxt, MSR_STAR, &msr_data);
ctxt->_eip = (u32)msr_data;
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
}
return X86EMUL_CONTINUE;
@@ -2258,7 +2271,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
break;
}
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
cs_sel = (u16)msr_data;
cs_sel &= ~SELECTOR_RPL_MASK;
ss_sel = cs_sel + 8;
@@ -2964,7 +2977,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
static int em_mov(struct x86_emulate_ctxt *ctxt)
{
- memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
return X86EMUL_CONTINUE;
}
@@ -3221,7 +3234,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
static int em_smsw(struct x86_emulate_ctxt *ctxt)
{
- ctxt->dst.bytes = 2;
+ if (ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
return X86EMUL_CONTINUE;
}
@@ -3496,7 +3510,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
- (rcx > 3))
+ ctxt->ops->check_pmc(ctxt, rcx))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -3521,9 +3535,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
}
#define D(_y) { .flags = (_y) }
-#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
-#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
- .check_perm = (_p) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
#define N D(NotImpl)
#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
@@ -3532,10 +3546,10 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
#define II(_f, _e, _i) \
- { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
+ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
#define IIP(_f, _e, _i, _p) \
- { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
- .check_perm = (_p) }
+ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
#define D2bv(_f) D((_f) | ByteOp), D(_f)
@@ -3634,8 +3648,8 @@ static const struct opcode group6[] = {
};
static const struct group_dual group7 = { {
- II(Mov | DstMem | Priv, em_sgdt, sgdt),
- II(Mov | DstMem | Priv, em_sidt, sidt),
+ II(Mov | DstMem, em_sgdt, sgdt),
+ II(Mov | DstMem, em_sidt, sidt),
II(SrcMem | Priv, em_lgdt, lgdt),
II(SrcMem | Priv, em_lidt, lidt),
II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3899,7 +3913,7 @@ static const struct opcode twobyte_table[256] = {
N, N,
N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */
- X16(D(DstReg | SrcMem | ModRM | Mov)),
+ X16(D(DstReg | SrcMem | ModRM)),
/* 0x50 - 0x5F */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0x60 - 0x6F */
@@ -4061,12 +4075,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
mem_common:
*op = ctxt->memop;
ctxt->memopp = op;
- if ((ctxt->d & BitOp) && op == &ctxt->dst)
+ if (ctxt->d & BitOp)
fetch_bit_operand(ctxt);
op->orig_val = op->val;
break;
case OpMem64:
- ctxt->memop.bytes = 8;
+ ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
goto mem_common;
case OpAcc:
op->type = OP_REG;
@@ -4150,7 +4164,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
- op->addr.mem.seg = seg_override(ctxt);
+ op->addr.mem.seg = ctxt->seg_override;
op->val = 0;
op->count = 1;
break;
@@ -4161,7 +4175,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
register_address(ctxt,
reg_read(ctxt, VCPU_REGS_RBX) +
(reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
- op->addr.mem.seg = seg_override(ctxt);
+ op->addr.mem.seg = ctxt->seg_override;
op->val = 0;
break;
case OpImmFAddr:
@@ -4208,16 +4222,22 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
int mode = ctxt->mode;
int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
bool op_prefix = false;
+ bool has_seg_override = false;
struct opcode opcode;
ctxt->memop.type = OP_NONE;
ctxt->memopp = NULL;
ctxt->_eip = ctxt->eip;
- ctxt->fetch.start = ctxt->_eip;
- ctxt->fetch.end = ctxt->fetch.start + insn_len;
+ ctxt->fetch.ptr = ctxt->fetch.data;
+ ctxt->fetch.end = ctxt->fetch.data + insn_len;
ctxt->opcode_len = 1;
if (insn_len > 0)
memcpy(ctxt->fetch.data, insn, insn_len);
+ else {
+ rc = __do_insn_fetch_bytes(ctxt, 1);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ }
switch (mode) {
case X86EMUL_MODE_REAL:
@@ -4261,11 +4281,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
case 0x2e: /* CS override */
case 0x36: /* SS override */
case 0x3e: /* DS override */
- set_seg_override(ctxt, (ctxt->b >> 3) & 3);
+ has_seg_override = true;
+ ctxt->seg_override = (ctxt->b >> 3) & 3;
break;
case 0x64: /* FS override */
case 0x65: /* GS override */
- set_seg_override(ctxt, ctxt->b & 7);
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->b & 7;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
@@ -4314,6 +4336,13 @@ done_prefixes:
if (ctxt->d & ModRM)
ctxt->modrm = insn_fetch(u8, ctxt);
+ /* vex-prefix instructions are not implemented */
+ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
+ (mode == X86EMUL_MODE_PROT64 ||
+ (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) {
+ ctxt->d = NotImpl;
+ }
+
while (ctxt->d & GroupMask) {
switch (ctxt->d & GroupMask) {
case Group:
@@ -4356,49 +4385,59 @@ done_prefixes:
ctxt->d |= opcode.flags;
}
- ctxt->execute = opcode.u.execute;
- ctxt->check_perm = opcode.check_perm;
- ctxt->intercept = opcode.intercept;
-
/* Unrecognised? */
- if (ctxt->d == 0 || (ctxt->d & NotImpl))
+ if (ctxt->d == 0)
return EMULATION_FAILED;
- if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
+ ctxt->execute = opcode.u.execute;
+
+ if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD)))
return EMULATION_FAILED;
- if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
- ctxt->op_bytes = 8;
+ if (unlikely(ctxt->d &
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) {
+ /*
+ * These are copied unconditionally here, and checked unconditionally
+ * in x86_emulate_insn.
+ */
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
+
+ if (ctxt->d & NotImpl)
+ return EMULATION_FAILED;
- if (ctxt->d & Op3264) {
- if (mode == X86EMUL_MODE_PROT64)
+ if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
ctxt->op_bytes = 8;
- else
- ctxt->op_bytes = 4;
- }
- if (ctxt->d & Sse)
- ctxt->op_bytes = 16;
- else if (ctxt->d & Mmx)
- ctxt->op_bytes = 8;
+ if (ctxt->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ else
+ ctxt->op_bytes = 4;
+ }
+
+ if (ctxt->d & Sse)
+ ctxt->op_bytes = 16;
+ else if (ctxt->d & Mmx)
+ ctxt->op_bytes = 8;
+ }
/* ModRM and SIB bytes. */
if (ctxt->d & ModRM) {
rc = decode_modrm(ctxt, &ctxt->memop);
- if (!ctxt->has_seg_override)
- set_seg_override(ctxt, ctxt->modrm_seg);
+ if (!has_seg_override) {
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->modrm_seg;
+ }
} else if (ctxt->d & MemAbs)
rc = decode_abs(ctxt, &ctxt->memop);
if (rc != X86EMUL_CONTINUE)
goto done;
- if (!ctxt->has_seg_override)
- set_seg_override(ctxt, VCPU_SREG_DS);
-
- ctxt->memop.addr.mem.seg = seg_override(ctxt);
+ if (!has_seg_override)
+ ctxt->seg_override = VCPU_SREG_DS;
- if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
- ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+ ctxt->memop.addr.mem.seg = ctxt->seg_override;
/*
* Decode and fetch the source operand: register, memory
@@ -4420,7 +4459,7 @@ done_prefixes:
rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
done:
- if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
+ if (ctxt->rip_relative)
ctxt->memopp->addr.mem.ea += ctxt->_eip;
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
@@ -4495,6 +4534,16 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
return X86EMUL_CONTINUE;
}
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+ memset(&ctxt->rip_relative, 0,
+ (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.end = 0;
+}
+
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
@@ -4503,12 +4552,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
ctxt->mem_read.pos = 0;
- if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
- (ctxt->d & Undefined)) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
/* LOCK prefix is allowed only with some instructions */
if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
rc = emulate_ud(ctxt);
@@ -4520,69 +4563,82 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
- || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
- rc = emulate_nm(ctxt);
- goto done;
- }
+ if (unlikely(ctxt->d &
+ (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+ (ctxt->d & Undefined)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
- if (ctxt->d & Mmx) {
- rc = flush_pending_x87_faults(ctxt);
- if (rc != X86EMUL_CONTINUE)
+ if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+ || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ rc = emulate_ud(ctxt);
goto done;
- /*
- * Now that we know the fpu is exception safe, we can fetch
- * operands from it.
- */
- fetch_possible_mmx_operand(ctxt, &ctxt->src);
- fetch_possible_mmx_operand(ctxt, &ctxt->src2);
- if (!(ctxt->d & Mov))
- fetch_possible_mmx_operand(ctxt, &ctxt->dst);
- }
+ }
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_PRE_EXCEPT);
- if (rc != X86EMUL_CONTINUE)
+ if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
goto done;
- }
+ }
- /* Privileged instruction can be executed only in CPL=0 */
- if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(ctxt, &ctxt->src);
+ fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+ }
- /* Instruction can only be executed in protected mode */
- if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
- rc = emulate_ud(ctxt);
- goto done;
- }
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
- /* Do instruction specific permission checks */
- if (ctxt->check_perm) {
- rc = ctxt->check_perm(ctxt);
- if (rc != X86EMUL_CONTINUE)
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+ if (ctxt->d & PrivUD)
+ rc = emulate_ud(ctxt);
+ else
+ rc = emulate_gp(ctxt, 0);
goto done;
- }
+ }
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_POST_EXCEPT);
- if (rc != X86EMUL_CONTINUE)
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
goto done;
- }
+ }
- if (ctxt->rep_prefix && (ctxt->d & String)) {
- /* All REP prefixes have the same first termination condition */
- if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
- ctxt->eip = ctxt->_eip;
- goto done;
+ /* Do instruction specific permission checks */
+ if (ctxt->d & CheckPerm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ ctxt->eip = ctxt->_eip;
+ ctxt->eflags &= ~EFLG_RF;
+ goto done;
+ }
}
}
@@ -4616,13 +4672,18 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
goto done;
}
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ ctxt->eflags |= EFLG_RF;
+ else
+ ctxt->eflags &= ~EFLG_RF;
+
if (ctxt->execute) {
if (ctxt->d & Fastop) {
void (*fop)(struct fastop *) = (void *)ctxt->execute;
@@ -4657,8 +4718,9 @@ special_insn:
break;
case 0x90 ... 0x97: /* nop / xchg reg, rax */
if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
- break;
- rc = em_xchg(ctxt);
+ ctxt->dst.type = OP_NONE;
+ else
+ rc = em_xchg(ctxt);
break;
case 0x98: /* cbw/cwde/cdqe */
switch (ctxt->op_bytes) {
@@ -4709,17 +4771,17 @@ special_insn:
goto done;
writeback:
- if (!(ctxt->d & NoWrite)) {
- rc = writeback(ctxt, &ctxt->dst);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
if (ctxt->d & SrcWrite) {
BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
rc = writeback(ctxt, &ctxt->src);
if (rc != X86EMUL_CONTINUE)
goto done;
}
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
/*
* restore dst type in case the decoding will be reused
@@ -4761,6 +4823,7 @@ writeback:
}
goto done; /* skip rip writeback */
}
+ ctxt->eflags &= ~EFLG_RF;
}
ctxt->eip = ctxt->_eip;
@@ -4793,8 +4856,10 @@ twobyte_insn:
ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
break;
case 0x40 ... 0x4f: /* cmov */
- ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
- if (!test_cc(ctxt->b, ctxt->eflags))
+ if (test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.val = ctxt->src.val;
+ else if (ctxt->mode != X86EMUL_MODE_PROT64 ||
+ ctxt->op_bytes != 4)
ctxt->dst.type = OP_NONE; /* no writeback */
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
@@ -4818,8 +4883,8 @@ twobyte_insn:
break;
case 0xc3: /* movnti */
ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
- (u64) ctxt->src.val;
+ ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val :
+ (u32) ctxt->src.val;
break;
default:
goto cannot_emulate;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index bd0da433e6d7..a1ec6a50a05a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -108,7 +108,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
vector = kvm_cpu_get_extint(v);
- if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+ if (vector != -1)
return vector; /* PIC */
return kvm_get_apic_interrupt(v); /* APIC */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 006911858174..08e8a899e005 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -352,25 +352,46 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
- apic->irr_pending = false;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = apic->vcpu;
+
apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (apic_search_irr(apic) != -1)
- apic->irr_pending = true;
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ /* try to update RVI */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ else {
+ vec = apic_search_irr(apic);
+ apic->irr_pending = (vec != -1);
+ }
}
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- /* Note that we never get here with APIC virtualization enabled. */
+ struct kvm_vcpu *vcpu;
+
+ if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
- if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
- ++apic->isr_count;
- BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
/*
- * ISR (in service register) bit is set when injecting an interrupt.
- * The highest vector is injected. Thus the latest bit set matches
- * the highest bit in ISR.
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
*/
- apic->highest_isr_cache = vec;
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
}
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -1451,7 +1472,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
- apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
+ apic_debug("%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
vcpu, kvm_apic_id(apic),
vcpu->arch.apic_base, apic->base_address);
@@ -1627,11 +1648,16 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
- /* Note that we never get here with APIC virtualization enabled. */
-
if (vector == -1)
return -1;
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
+
apic_set_isr(vector, apic);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
@@ -1895,7 +1921,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- pr_debug("vcpu %d received sipi with vector # %x\n",
+ apic_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, sipi_vector);
kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 1185fe7a7f47..9ade5cfb5a4c 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -273,7 +273,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp)
int ret;
unsigned long enable;
- ret = strict_strtoul(val, 10, &enable);
+ ret = kstrtoul(val, 10, &enable);
if (ret < 0)
return -EINVAL;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 9d2e0ffcb190..5aaf35641768 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -22,7 +22,7 @@
__entry->unsync = sp->unsync;
#define KVM_MMU_PAGE_PRINTK() ({ \
- const char *ret = p->buffer + p->len; \
+ const u32 saved_len = p->len; \
static const char *access_str[] = { \
"---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
}; \
@@ -41,7 +41,7 @@
role.nxe ? "" : "!", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
- ret; \
+ p->buffer + saved_len; \
})
#define kvm_mmu_trace_pferr_flags \
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index cbecaa90399c..3dd6accb64ec 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -428,6 +428,15 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
+int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ bool fixed = pmc & (1u << 30);
+ pmc &= ~(3u << 30);
+ return (!fixed && pmc >= pmu->nr_arch_gp_counters) ||
+ (fixed && pmc >= pmu->nr_arch_fixed_counters);
+}
+
int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
{
struct kvm_pmu *pmu = &vcpu->arch.pmu;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b5e994ad0135..ddf742768ecf 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -486,14 +486,14 @@ static int is_external_interrupt(u32 info)
return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}
-static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 ret = 0;
if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
- return ret & mask;
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
}
static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
@@ -1415,7 +1415,16 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
- var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+ * So let's synthesize a legal G bit for all segments, this helps
+ * running KVM nested. It also helps cross-vendor migration, because
+ * Intel's vmentry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
/*
* AMD's VMCB does not have an explicit unusable field, so emulate it
@@ -1424,14 +1433,6 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->unusable = !var->present || (var->type == 0);
switch (seg) {
- case VCPU_SREG_CS:
- /*
- * SVM always stores 0 for the 'G' bit in the CS selector in
- * the VMCB on a VMEXIT. This hurts cross-vendor migration:
- * Intel's VMENTRY has a check on the 'G' bit.
- */
- var->g = s->limit > 0xfffff;
- break;
case VCPU_SREG_TR:
/*
* Work around a bug where the busy flag in the tr selector
@@ -2116,22 +2117,27 @@ static void nested_svm_unmap(struct page *page)
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
- unsigned port;
- u8 val, bit;
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
u64 gpa;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
gpa = svm->nested.vmcb_iopm + (port / 8);
- bit = port % 8;
- val = 0;
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
- val &= (1 << bit);
+ if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
- return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -4205,7 +4211,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
if (info->intercept == x86_intercept_cr_write)
icpt_info.exit_code += info->modrm_reg;
- if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
+ info->intercept == x86_intercept_clts)
break;
intercept = svm->nested.intercept;
@@ -4250,14 +4257,14 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
u64 exit_info;
u32 bytes;
- exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
-
if (info->intercept == x86_intercept_in ||
info->intercept == x86_intercept_ins) {
- exit_info |= SVM_IOIO_TYPE_MASK;
- bytes = info->src_bytes;
- } else {
+ exit_info = ((info->src_val & 0xffff) << 16) |
+ SVM_IOIO_TYPE_MASK;
bytes = info->dst_bytes;
+ } else {
+ exit_info = (info->dst_val & 0xffff) << 16;
+ bytes = info->src_bytes;
}
if (info->intercept == x86_intercept_outs ||
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 33574c95220d..e850a7d332be 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -721,10 +721,10 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
- __entry->len = vcpu->arch.emulate_ctxt._eip
- - vcpu->arch.emulate_ctxt.fetch.start;
+ __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr
+ - vcpu->arch.emulate_ctxt.fetch.data;
+ __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len;
memcpy(__entry->insn,
vcpu->arch.emulate_ctxt.fetch.data,
15);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 801332edefc3..bfe11cf124a1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -383,6 +383,9 @@ struct nested_vmx {
struct hrtimer preemption_timer;
bool preemption_timer_expired;
+
+ /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
+ u64 vmcs01_debugctl;
};
#define POSTED_INTR_ON 0
@@ -740,7 +743,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static bool vmx_mpx_supported(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -820,7 +822,6 @@ static const u32 vmx_msr_index[] = {
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
static inline bool is_page_fault(u32 intr_info)
{
@@ -1940,7 +1941,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
}
-static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
int ret = 0;
@@ -1950,7 +1951,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
ret |= KVM_X86_SHADOW_INT_MOV_SS;
- return ret & mask;
+ return ret;
}
static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
@@ -2239,10 +2240,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
* or other means.
*/
static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
+static u32 nested_vmx_true_procbased_ctls_low;
static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
+static u32 nested_vmx_true_exit_ctls_low;
static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_true_entry_ctls_low;
static u32 nested_vmx_misc_low, nested_vmx_misc_high;
static u32 nested_vmx_ept_caps;
static __init void nested_vmx_setup_ctls_msrs(void)
@@ -2265,21 +2269,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
- /*
- * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
- * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
- */
nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- /*
- * Exit controls
- * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
- * 17 must be 1.
- */
+ /* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -2296,10 +2292,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
if (vmx_mpx_supported())
nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ /* We support free control of debug control saving. */
+ nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
+ ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
- /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_entry_ctls_high &=
#ifdef CONFIG_X86_64
@@ -2311,10 +2310,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
if (vmx_mpx_supported())
nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ /* We support free control of debug control loading. */
+ nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
+ ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
- nested_vmx_procbased_ctls_low = 0;
+ nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_procbased_ctls_high &=
CPU_BASED_VIRTUAL_INTR_PENDING |
CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
@@ -2335,7 +2338,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
* can use it to avoid exits to L1 - even when L0 runs L2
* without MSR bitmaps.
*/
- nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
+ nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ CPU_BASED_USE_MSR_BITMAPS;
+
+ /* We support free control of CR3 access interception. */
+ nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
+ ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/* secondary cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -2394,7 +2402,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- *pdata = VMCS12_REVISION |
+ *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
break;
@@ -2404,16 +2412,25 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
nested_vmx_pinbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
+ nested_vmx_procbased_ctls_high);
+ break;
case MSR_IA32_VMX_PROCBASED_CTLS:
*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
+ nested_vmx_exit_ctls_high);
+ break;
case MSR_IA32_VMX_EXIT_CTLS:
*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
+ nested_vmx_entry_ctls_high);
+ break;
case MSR_IA32_VMX_ENTRY_CTLS:
*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
nested_vmx_entry_ctls_high);
@@ -2442,7 +2459,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = -1ULL;
break;
case MSR_IA32_VMX_VMCS_ENUM:
- *pdata = 0x1f;
+ *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
@@ -3653,7 +3670,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
out:
- vmx->emulation_required |= emulation_required(vcpu);
+ vmx->emulation_required = emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -4422,7 +4439,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmx->vcpu.arch.pat = host_pat;
}
- for (i = 0; i < NR_VMX_MSR; ++i) {
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
u32 index = vmx_msr_index[i];
u32 data_low, data_high;
int j = vmx->nmsrs;
@@ -4873,7 +4890,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= dr6;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
if (!(dr6 & ~DR6_RESERVED)) /* icebp */
skip_emulated_instruction(vcpu);
@@ -5039,7 +5056,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- val = kvm_register_read(vcpu, reg);
+ val = kvm_register_readl(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
@@ -5056,7 +5073,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
- u8 cr8 = kvm_register_read(vcpu, reg);
+ u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
kvm_complete_insn_gp(vcpu, err);
if (irqchip_in_kernel(vcpu->kvm))
@@ -5132,7 +5149,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
return 0;
} else {
vcpu->arch.dr7 &= ~DR7_GD;
- vcpu->arch.dr6 |= DR6_BD;
+ vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
kvm_queue_exception(vcpu, DB_VECTOR);
return 1;
@@ -5165,7 +5182,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
return 1;
kvm_register_write(vcpu, reg, val);
} else
- if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)))
+ if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
return 1;
skip_emulated_instruction(vcpu);
@@ -5621,7 +5638,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
- while (!guest_state_valid(vcpu) && count-- != 0) {
+ while (vmx->emulation_required && count-- != 0) {
if (intr_window_requested && vmx_interrupt_allowed(vcpu))
return handle_interrupt_window(&vmx->vcpu);
@@ -5655,7 +5672,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
schedule();
}
- vmx->emulation_required = emulation_required(vcpu);
out:
return ret;
}
@@ -5754,22 +5770,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
/*
* Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
- * currently used, if running L2), and vmcs01 when running L2.
+ * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+ * must be &vmx->vmcs01.
*/
static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
{
struct vmcs02_list *item, *n;
+
+ WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- if (vmx->loaded_vmcs != &item->vmcs02)
- free_loaded_vmcs(&item->vmcs02);
+ /*
+ * Something will leak if the above WARN triggers. Better than
+ * a use-after-free.
+ */
+ if (vmx->loaded_vmcs == &item->vmcs02)
+ continue;
+
+ free_loaded_vmcs(&item->vmcs02);
list_del(&item->list);
kfree(item);
+ vmx->nested.vmcs02_num--;
}
- vmx->nested.vmcs02_num = 0;
-
- if (vmx->loaded_vmcs != &vmx->vmcs01)
- free_loaded_vmcs(&vmx->vmcs01);
}
/*
@@ -5918,7 +5939,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
* which replaces physical address width with 32
*
*/
- if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failInvalid(vcpu);
skip_emulated_instruction(vcpu);
return 1;
@@ -5936,7 +5957,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
vmx->nested.vmxon_ptr = vmptr;
break;
case EXIT_REASON_VMCLEAR:
- if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_INVALID_ADDRESS);
skip_emulated_instruction(vcpu);
@@ -5951,7 +5972,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
}
break;
case EXIT_REASON_VMPTRLD:
- if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INVALID_ADDRESS);
skip_emulated_instruction(vcpu);
@@ -6086,20 +6107,27 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
u32 exec_control;
+ if (vmx->nested.current_vmptr == -1ull)
+ return;
+
+ /* current_vmptr and current_vmcs12 are always set/reset together */
+ if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
+ return;
+
if (enable_shadow_vmcs) {
- if (vmx->nested.current_vmcs12 != NULL) {
- /* copy to memory all shadowed fields in case
- they were modified */
- copy_shadow_to_vmcs12(vmx);
- vmx->nested.sync_shadow_vmcs = false;
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
- }
+ /* copy to memory all shadowed fields in case
+ they were modified */
+ copy_shadow_to_vmcs12(vmx);
+ vmx->nested.sync_shadow_vmcs = false;
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
kunmap(vmx->nested.current_vmcs12_page);
nested_release_page(vmx->nested.current_vmcs12_page);
+ vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmcs12 = NULL;
}
/*
@@ -6110,12 +6138,9 @@ static void free_nested(struct vcpu_vmx *vmx)
{
if (!vmx->nested.vmxon)
return;
+
vmx->nested.vmxon = false;
- if (vmx->nested.current_vmptr != -1ull) {
- nested_release_vmcs12(vmx);
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
- }
+ nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
/* Unpin physical memory we referred to in current vmcs02 */
@@ -6152,11 +6177,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
return 1;
- if (vmptr == vmx->nested.current_vmptr) {
+ if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vmx);
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
- }
page = nested_get_page(vcpu, vmptr);
if (page == NULL) {
@@ -6384,7 +6406,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
return 1;
/* Decode instruction info and find the field to read */
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
if (!vmcs12_read_any(vcpu, field, &field_value)) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@@ -6397,7 +6419,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
* on the guest's mode (32 or 64 bit), not on the given field's length.
*/
if (vmx_instruction_info & (1u << 10)) {
- kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+ kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
field_value);
} else {
if (get_vmx_mem_address(vcpu, exit_qualification,
@@ -6434,21 +6456,21 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
if (vmx_instruction_info & (1u << 10))
- field_value = kvm_register_read(vcpu,
+ field_value = kvm_register_readl(vcpu,
(((vmx_instruction_info) >> 3) & 0xf));
else {
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, &gva))
return 1;
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
- &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
+ &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
}
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
if (vmcs_field_readonly(field)) {
nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
@@ -6498,9 +6520,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
return 1;
}
- if (vmx->nested.current_vmptr != -1ull)
- nested_release_vmcs12(vmx);
+ nested_release_vmcs12(vmx);
vmx->nested.current_vmptr = vmptr;
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
@@ -6571,7 +6592,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
}
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
@@ -6751,7 +6772,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
int cr = exit_qualification & 15;
int reg = (exit_qualification >> 8) & 15;
- unsigned long val = kvm_register_read(vcpu, reg);
+ unsigned long val = kvm_register_readl(vcpu, reg);
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
@@ -7112,7 +7133,26 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
if (max_irr == -1)
return;
- vmx_set_rvi(max_irr);
+ /*
+ * If a vmexit is needed, vmx_check_nested_events handles it.
+ */
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return;
+
+ if (!is_guest_mode(vcpu)) {
+ vmx_set_rvi(max_irr);
+ return;
+ }
+
+ /*
+ * Fall back to pre-APICv interrupt injection since L2
+ * is run without virtual interrupt delivery.
+ */
+ if (!kvm_event_needs_reinjection(vcpu) &&
+ vmx_interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, max_irr, false);
+ vmx_inject_irq(vcpu);
+ }
}
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -7520,13 +7560,31 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_complete_interrupts(vmx);
}
+static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (vmx->loaded_vmcs == &vmx->vmcs01)
+ return;
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx_vcpu_put(vcpu);
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ put_cpu();
+}
+
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
free_vpid(vmx);
- free_loaded_vmcs(vmx->loaded_vmcs);
+ leave_guest_mode(vcpu);
+ vmx_load_vmcs01(vcpu);
free_nested(vmx);
+ free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7548,6 +7606,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vcpu;
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
+ > PAGE_SIZE);
+
err = -ENOMEM;
if (!vmx->guest_msrs) {
goto uninit_vcpu;
@@ -7836,7 +7897,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+ }
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
vmcs12->vm_entry_intr_info_field);
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
@@ -7846,7 +7913,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
- kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
vmcs12->guest_pending_dbg_exceptions);
@@ -8113,14 +8179,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
- !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
+ !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
- !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
+ !PAGE_ALIGNED(vmcs12->apic_access_addr)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8136,15 +8202,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
+ nested_vmx_true_procbased_ctls_low,
+ nested_vmx_procbased_ctls_high) ||
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
+ nested_vmx_true_exit_ctls_low,
+ nested_vmx_exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
+ nested_vmx_true_entry_ctls_low,
+ nested_vmx_entry_ctls_high))
{
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8221,6 +8290,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
+ if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
cpu = get_cpu();
vmx->loaded_vmcs = vmcs02;
vmx_vcpu_put(vcpu);
@@ -8398,7 +8470,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
- kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
@@ -8477,9 +8548,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ }
+
/* TODO: These cannot have changed unless we have MSR bitmaps and
* the relevant bit asks not to trap the change */
- vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
@@ -8670,7 +8745,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
unsigned long exit_qualification)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int cpu;
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
/* trying to cancel vmlaunch/vmresume is a bug */
@@ -8680,6 +8754,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
+ vmx_load_vmcs01(vcpu);
+
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
&& nested_exit_intr_ack_set(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
@@ -8695,13 +8771,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
- cpu = get_cpu();
- vmx->loaded_vmcs = &vmx->vmcs01;
- vmx_vcpu_put(vcpu);
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- put_cpu();
-
vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
vmx_segment_cache_clear(vmx);
@@ -8890,7 +8959,7 @@ static int __init vmx_init(void)
rdmsrl_safe(MSR_EFER, &host_efer);
- for (i = 0; i < NR_VMX_MSR; ++i)
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
kvm_define_shared_msr(i, vmx_msr_index[i]);
vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ef432f891d30..8f1e22d3b286 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -87,6 +87,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -211,6 +212,7 @@ static void shared_msr_update(unsigned slot, u32 msr)
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
+ BUG_ON(slot >= KVM_NR_SHARED_MSRS);
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
shared_msrs_global.msrs[slot] = msr;
@@ -310,6 +312,31 @@ static int exception_class(int vector)
return EXCPT_BENIGN;
}
+#define EXCPT_FAULT 0
+#define EXCPT_TRAP 1
+#define EXCPT_ABORT 2
+#define EXCPT_INTERRUPT 3
+
+static int exception_type(int vector)
+{
+ unsigned int mask;
+
+ if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
+ return EXCPT_INTERRUPT;
+
+ mask = 1 << vector;
+
+ /* #DB is trap, as instruction watchpoints are handled elsewhere */
+ if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ return EXCPT_TRAP;
+
+ if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
+ return EXCPT_ABORT;
+
+ /* Reserved exceptions will result in fault */
+ return EXCPT_FAULT;
+}
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool reinject)
@@ -758,6 +785,15 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+ u64 fixed = DR6_FIXED_1;
+
+ if (!guest_cpuid_has_rtm(vcpu))
+ fixed |= DR6_RTM;
+ return fixed;
+}
+
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
@@ -773,7 +809,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 6:
if (val & 0xffffffff00000000ULL)
return -1; /* #GP */
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
kvm_update_dr6(vcpu);
break;
case 5:
@@ -984,9 +1020,8 @@ struct pvclock_gtod_data {
u32 shift;
} clock;
- /* open coded 'struct timespec' */
- u64 monotonic_time_snsec;
- time_t monotonic_time_sec;
+ u64 boot_ns;
+ u64 nsec_base;
};
static struct pvclock_gtod_data pvclock_gtod_data;
@@ -994,27 +1029,21 @@ static struct pvclock_gtod_data pvclock_gtod_data;
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+ u64 boot_ns;
+
+ boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
- vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
- vdata->clock.cycle_last = tk->clock->cycle_last;
- vdata->clock.mask = tk->clock->mask;
- vdata->clock.mult = tk->mult;
- vdata->clock.shift = tk->shift;
-
- vdata->monotonic_time_sec = tk->xtime_sec
- + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->xtime_nsec
- + (tk->wall_to_monotonic.tv_nsec
- << tk->shift);
- while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->shift)) {
- vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->shift;
- vdata->monotonic_time_sec++;
- }
+ vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->tkr.cycle_last;
+ vdata->clock.mask = tk->tkr.mask;
+ vdata->clock.mult = tk->tkr.mult;
+ vdata->clock.shift = tk->tkr.shift;
+
+ vdata->boot_ns = boot_ns;
+ vdata->nsec_base = tk->tkr.xtime_nsec;
write_seqcount_end(&vdata->seq);
}
@@ -1109,11 +1138,7 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
static inline u64 get_kernel_ns(void)
{
- struct timespec ts;
-
- ktime_get_ts(&ts);
- monotonic_to_bootbased(&ts);
- return timespec_to_ns(&ts);
+ return ktime_get_boot_ns();
}
#ifdef CONFIG_X86_64
@@ -1215,6 +1240,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
unsigned long flags;
s64 usdiff;
bool matched;
+ bool already_matched;
u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -1279,6 +1305,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
+ already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
} else {
/*
* We split periods of matched TSC writes into generations.
@@ -1294,7 +1321,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm->arch.cur_tsc_write = data;
kvm->arch.cur_tsc_offset = offset;
matched = false;
- pr_debug("kvm: new tsc generation %u, clock %llu\n",
+ pr_debug("kvm: new tsc generation %llu, clock %llu\n",
kvm->arch.cur_tsc_generation, data);
}
@@ -1319,10 +1346,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
- if (matched)
- kvm->arch.nr_vcpus_matched_tsc++;
- else
+ if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (!already_matched) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
kvm_track_tsc_matching(vcpu);
spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1375,23 +1403,22 @@ static inline u64 vgettsc(cycle_t *cycle_now)
return v * gtod->clock.mult;
}
-static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
- u64 ns;
int mode;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ u64 ns;
- ts->tv_nsec = 0;
do {
seq = read_seqcount_begin(&gtod->seq);
mode = gtod->clock.vclock_mode;
- ts->tv_sec = gtod->monotonic_time_sec;
- ns = gtod->monotonic_time_snsec;
+ ns = gtod->nsec_base;
ns += vgettsc(cycle_now);
ns >>= gtod->clock.shift;
+ ns += gtod->boot_ns;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
- timespec_add_ns(ts, ns);
+ *t = ns;
return mode;
}
@@ -1399,19 +1426,11 @@ static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
/* returns true if host is using tsc clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
{
- struct timespec ts;
-
/* checked again under seqlock below */
if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
return false;
- if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
- return false;
-
- monotonic_to_bootbased(&ts);
- *kernel_ns = timespec_to_ns(&ts);
-
- return true;
+ return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
}
#endif
@@ -2032,6 +2051,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
+ data &= ~(u64)0x40000; /* ignore Mc status write enable */
if (data != 0) {
vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
@@ -2616,7 +2636,7 @@ out:
return r;
}
-int kvm_dev_ioctl_check_extension(long ext)
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -2974,9 +2994,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
- events->interrupt.shadow =
- kvm_x86_ops->get_interrupt_shadow(vcpu,
- KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
+ events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
@@ -4082,7 +4100,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
+ ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
+ offset, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
@@ -4103,10 +4122,24 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ unsigned offset;
+ int ret;
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK,
- exception);
+ /* Inline kvm_read_guest_virt_helper for speed. */
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
+ exception);
+ if (unlikely(gpa == UNMAPPED_GVA))
+ return X86EMUL_PROPAGATE_FAULT;
+
+ offset = addr & (PAGE_SIZE-1);
+ if (WARN_ON(offset + bytes > PAGE_SIZE))
+ bytes = (unsigned)PAGE_SIZE - offset;
+ ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
+ if (unlikely(ret < 0))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
}
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
@@ -4730,7 +4763,6 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
if (desc->g)
var.limit = (var.limit << 12) | 0xfff;
var.type = desc->type;
- var.present = desc->p;
var.dpl = desc->dpl;
var.db = desc->d;
var.s = desc->s;
@@ -4762,6 +4794,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
+static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc)
+{
+ return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc);
+}
+
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc, u64 *pdata)
{
@@ -4838,6 +4876,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_dr = emulator_set_dr,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
+ .check_pmc = emulator_check_pmc,
.read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
@@ -4850,7 +4889,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -4858,8 +4897,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
* means that the last instruction is an sti. We should not
* leave the flag on in this case. The same goes for mov ss
*/
- if (!(int_shadow & mask))
+ if (int_shadow & mask)
+ mask = 0;
+ if (unlikely(int_shadow || mask)) {
kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+ if (!mask)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
}
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
@@ -4874,19 +4918,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
kvm_queue_exception(vcpu, ctxt->exception.vector);
}
-static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
-{
- memset(&ctxt->opcode_len, 0,
- (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
-
- ctxt->fetch.start = 0;
- ctxt->fetch.end = 0;
- ctxt->io_read.pos = 0;
- ctxt->io_read.end = 0;
- ctxt->mem_read.pos = 0;
- ctxt->mem_read.end = 0;
-}
-
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -5085,23 +5116,22 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
/*
- * Use the "raw" value to see if TF was passed to the processor.
- * Note that the new value of the flags has not been saved yet.
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
*
* This is correct even for TF set by the guest, because "the
* processor will not generate this exception after the instruction
* that sets the TF flag".
*/
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
-
if (unlikely(rflags & X86_EFLAGS_TF)) {
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
+ DR6_RTM;
kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -5114,7 +5144,7 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
* cleared by the processor".
*/
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
}
}
@@ -5133,7 +5163,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
vcpu->arch.eff_db);
if (dr6 != 0) {
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
get_segment_base(vcpu, VCPU_SREG_CS);
@@ -5144,14 +5174,15 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
}
}
- if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.dr7,
vcpu->arch.db);
if (dr6 != 0) {
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= dr6;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
*r = EMULATE_DONE;
return true;
@@ -5215,6 +5246,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->eflags & X86_EFLAGS_RF)
+ kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
return EMULATE_DONE;
}
@@ -5265,13 +5298,22 @@ restart:
r = EMULATE_DONE;
if (writeback) {
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
- kvm_vcpu_check_singlestep(vcpu, &r);
- kvm_set_rflags(vcpu, ctxt->eflags);
+ kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ __kvm_set_rflags(vcpu, ctxt->eflags);
+
+ /*
+ * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
+ * do nothing, and it will be requested again as soon as
+ * the shadow expires. But we still need to check here,
+ * because POPF has no interrupt shadow.
+ */
+ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
} else
vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
@@ -5662,7 +5704,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
u64 param, ingpa, outgpa, ret;
uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
bool fast, longmode;
- int cs_db, cs_l;
/*
* hypercall generates UD from non zero cpl and real mode
@@ -5673,8 +5714,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- longmode = is_long_mode(vcpu) && cs_l == 1;
+ longmode = is_64_bit_mode(vcpu);
if (!longmode) {
param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
@@ -5739,7 +5779,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
- int r = 1;
+ int op_64_bit, r = 1;
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
@@ -5752,7 +5792,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- if (!is_long_mode(vcpu)) {
+ op_64_bit = is_64_bit_mode(vcpu);
+ if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
a1 &= 0xFFFFFFFF;
@@ -5778,6 +5819,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
break;
}
out:
+ if (!op_64_bit)
+ ret = (u32)ret;
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
++vcpu->stat.hypercalls;
return r;
@@ -5856,6 +5899,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
trace_kvm_inj_exception(vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
+
+ if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+ __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+ X86_EFLAGS_RF);
+
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
@@ -6847,9 +6895,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
+ kvm_clear_interrupt_queue(vcpu);
+ kvm_clear_exception_queue(vcpu);
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
- vcpu->arch.dr6 = DR6_FIXED_1;
+ vcpu->arch.dr6 = DR6_INIT;
kvm_update_dr6(vcpu);
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -7405,12 +7455,17 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);
-void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ __kvm_set_rflags(vcpu, rflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 8c97bac9a895..306a1b77581f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -47,6 +47,16 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
#endif
}
+static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
+{
+ int cs_db, cs_l;
+
+ if (!is_long_mode(vcpu))
+ return false;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ return cs_l;
+}
+
static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
{
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
@@ -108,6 +118,23 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
return false;
}
+static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ return is_64_bit_mode(vcpu) ? val : (u32)val;
+}
+
+static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ if (!is_64_bit_mode(vcpu))
+ val = (u32)val;
+ return kvm_register_write(vcpu, reg, val);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 167ffcac16ed..95a427e57887 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -48,7 +48,9 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+# ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
+# endif
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -71,7 +73,9 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
+# endif
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 36642793e315..a24194681513 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -577,6 +577,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+static const char smep_warning[] = KERN_CRIT
+"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
@@ -597,6 +599,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
+ if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ (pgd_flags(*pgd) & _PAGE_USER) &&
+ (read_cr4() & X86_CR4_SMEP))
+ printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
}
printk(KERN_ALERT "BUG: unable to handle kernel ");
@@ -1212,7 +1218,8 @@ good_area:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
- * the fault:
+ * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
fault = handle_mm_fault(mm, vma, address, flags);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f97130618113..66dba36f2343 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,13 @@
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
+/*
+ * We need to define the tracepoints somewhere, and tlb.c
+ * is only compied when SMP=y.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/tlb.h>
+
#include "mm_internal.h"
static unsigned long __initdata pgt_buf_start;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e39504878aec..7d05565ba781 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -825,7 +825,8 @@ void __init mem_init(void)
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdata = NODE_DATA(nid);
- struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
+ struct zone *zone = pgdata->node_zones +
+ zone_for_memory(nid, start, size, ZONE_HIGHMEM);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df1a9927ad29..5621c47d7a1a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -691,7 +691,8 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+ struct zone *zone = pgdat->node_zones +
+ zone_for_memory(nid, start, size, ZONE_NORMAL);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 25e7e1372bb2..919b91205cd4 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -31,7 +31,7 @@
#include <linux/sched.h>
#include <asm/elf.h>
-struct __read_mostly va_alignment va_align = {
+struct va_alignment __read_mostly va_align = {
.flags = -1,
};
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dd8dda167a24..ee61c36d64f8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -49,6 +49,13 @@ void leave_mm(int cpu)
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
load_cr3(swapper_pg_dir);
+ /*
+ * This gets called in the idle path where RCU
+ * functions differently. Tracing normally
+ * uses RCU, so we have to call the tracepoint
+ * specially here.
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
}
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -102,20 +109,24 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
+ if (!f->flush_end)
+ f->flush_end = f->flush_start + PAGE_SIZE;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_end == TLB_FLUSH_ALL)
+ if (f->flush_end == TLB_FLUSH_ALL) {
local_flush_tlb();
- else if (!f->flush_end)
- __flush_tlb_single(f->flush_start);
- else {
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
+ } else {
unsigned long addr;
+ unsigned long nr_pages =
+ f->flush_end - f->flush_start / PAGE_SIZE;
addr = f->flush_start;
while (addr < f->flush_end) {
__flush_tlb_single(addr);
addr += PAGE_SIZE;
}
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
}
} else
leave_mm(smp_processor_id());
@@ -153,46 +164,45 @@ void flush_tlb_current_task(void)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
+/*
+ * See Documentation/x86/tlb.txt for details. We choose 33
+ * because it is large enough to cover the vast majority (at
+ * least 95%) of allocations, and is small enough that we are
+ * confident it will not cause too much overhead. Each single
+ * flush is about 100 ns, so this caps the maximum overhead at
+ * _about_ 3,000 ns.
+ *
+ * This is in units of pages.
+ */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
- unsigned act_entries, tlb_entries = 0;
- unsigned long nr_base_pages;
+ /* do a global flush by default */
+ unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
if (current->active_mm != mm)
- goto flush_all;
+ goto out;
if (!current->mm) {
leave_mm(smp_processor_id());
- goto flush_all;
+ goto out;
}
- if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
- || vmflag & VM_HUGETLB) {
- local_flush_tlb();
- goto flush_all;
- }
-
- /* In modern CPU, last level tlb used for both data/ins */
- if (vmflag & VM_EXEC)
- tlb_entries = tlb_lli_4k[ENTRIES];
- else
- tlb_entries = tlb_lld_4k[ENTRIES];
+ if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+ base_pages_to_flush = (end - start) >> PAGE_SHIFT;
- /* Assume all of TLB entries was occupied by this task */
- act_entries = tlb_entries >> tlb_flushall_shift;
- act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
- nr_base_pages = (end - start) >> PAGE_SHIFT;
-
- /* tlb_flushall_shift is on balance point, details in commit log */
- if (nr_base_pages > act_entries) {
+ if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
+ base_pages_to_flush = TLB_FLUSH_ALL;
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
@@ -201,17 +211,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
-
- if (cpumask_any_but(mm_cpumask(mm),
- smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, end);
- preempt_enable();
- return;
}
-
-flush_all:
+ trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
+out:
+ if (base_pages_to_flush == TLB_FLUSH_ALL) {
+ start = 0UL;
+ end = TLB_FLUSH_ALL;
+ }
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+ flush_tlb_others(mm_cpumask(mm), mm, start, end);
preempt_enable();
}
@@ -260,32 +268,26 @@ static void do_kernel_range_flush(void *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- unsigned act_entries;
- struct flush_tlb_info info;
-
- /* In modern CPU, last level tlb used for both data/ins */
- act_entries = tlb_lld_4k[ENTRIES];
/* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
- (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
-
+ if (end == TLB_FLUSH_ALL ||
+ (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
- else {
+ } else {
+ struct flush_tlb_info info;
info.flush_start = start;
info.flush_end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
}
}
-#ifdef CONFIG_DEBUG_TLBFLUSH
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[32];
unsigned int len;
- len = sprintf(buf, "%hd\n", tlb_flushall_shift);
+ len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
@@ -294,20 +296,20 @@ static ssize_t tlbflush_write_file(struct file *file,
{
char buf[32];
ssize_t len;
- s8 shift;
+ int ceiling;
len = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, user_buf, len))
return -EFAULT;
buf[len] = '\0';
- if (kstrtos8(buf, 0, &shift))
+ if (kstrtoint(buf, 0, &ceiling))
return -EINVAL;
- if (shift < -1 || shift >= BITS_PER_LONG)
+ if (ceiling < 0)
return -EINVAL;
- tlb_flushall_shift = shift;
+ tlb_single_page_flush_ceiling = ceiling;
return count;
}
@@ -317,11 +319,10 @@ static const struct file_operations fops_tlbflush = {
.llseek = default_llseek,
};
-static int __init create_tlb_flushall_shift(void)
+static int __init create_tlb_single_page_flush_ceiling(void)
{
- debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
+ debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_tlbflush);
return 0;
}
-late_initcall(create_tlb_flushall_shift);
-#endif
+late_initcall(create_tlb_single_page_flush_ceiling);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 99bef86ed6df..5c8cb8043c5a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -211,10 +211,10 @@ struct jit_context {
bool seen_ld_abs;
};
-static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image,
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
- struct sock_filter_int *insn = bpf_prog->insnsi;
+ struct bpf_insn *insn = bpf_prog->insnsi;
int insn_cnt = bpf_prog->len;
u8 temp[64];
int i;
@@ -235,7 +235,7 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image,
/* mov qword ptr [rbp-X],rbx */
EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
- /* sk_convert_filter() maps classic BPF register X to R7 and uses R8
+ /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
* as temporary, so all tcpdump filters need to spill/fill R7(r13) and
* R8(r14). R9(r15) spill could be made conditional, but there is only
* one 'bpf_error' return path out of helper functions inside bpf_jit.S
@@ -841,7 +841,7 @@ common_load: ctx->seen_ld_abs = true;
/* By design x64 JIT should support all BPF instructions
* This error will be seen if new instruction was added
* to interpreter, but not to JIT
- * or if there is junk in sk_filter
+ * or if there is junk in bpf_prog
*/
pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
return -EINVAL;
@@ -862,11 +862,11 @@ common_load: ctx->seen_ld_abs = true;
return proglen;
}
-void bpf_jit_compile(struct sk_filter *prog)
+void bpf_jit_compile(struct bpf_prog *prog)
{
}
-void bpf_int_jit_compile(struct sk_filter *prog)
+void bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
int proglen, oldproglen = 0;
@@ -932,7 +932,7 @@ out:
static void bpf_jit_free_deferred(struct work_struct *work)
{
- struct sk_filter *fp = container_of(work, struct sk_filter, work);
+ struct bpf_prog *fp = container_of(work, struct bpf_prog, work);
unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
struct bpf_binary_header *header = (void *)addr;
@@ -941,7 +941,7 @@ static void bpf_jit_free_deferred(struct work_struct *work)
kfree(fp);
}
-void bpf_jit_free(struct sk_filter *fp)
+void bpf_jit_free(struct bpf_prog *fp)
{
if (fp->jited) {
INIT_WORK(&fp->work, bpf_jit_free_deferred);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 5075371ab593..cfd1b132b8e3 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -448,7 +448,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
return;
size = sizeof(*info->res) * info->res_num;
- info->res = kzalloc(size, GFP_KERNEL);
+ info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node);
if (!info->res) {
info->res_num = 0;
return;
@@ -456,7 +456,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
size = sizeof(*info->res_offset) * info->res_num;
info->res_num = 0;
- info->res_offset = kzalloc(size, GFP_KERNEL);
+ info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node);
if (!info->res_offset) {
kfree(info->res);
info->res = NULL;
@@ -499,7 +499,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
if (node != NUMA_NO_NODE && !node_online(node))
node = NUMA_NO_NODE;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
if (!info) {
printk(KERN_WARNING "pci_bus %04x:%02x: "
"ignored (out of memory)\n", domain, busnum);
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index b5e60268d93f..9a2b7101ae8a 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -350,8 +350,7 @@ static void pci_fixup_video(struct pci_dev *pdev)
pci_read_config_word(pdev, PCI_COMMAND, &config);
if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
- dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
- vga_set_default_device(pdev);
+ dev_printk(KERN_DEBUG, &pdev->dev, "Video device with shadowed ROM\n");
}
}
}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a19ed92e74e4..2ae525e0d8ba 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -162,6 +162,10 @@ pcibios_align_resource(void *data, const struct resource *res,
return start;
if (start & 0x300)
start = (start + 0x3ff) & ~0x3ff;
+ } else if (res->flags & IORESOURCE_MEM) {
+ /* The low 1MB range is reserved for ISA cards */
+ if (start < BIOS_END)
+ start = BIOS_END;
}
return start;
}
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 84b9d672843d..b9958c364075 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -208,27 +208,31 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
- u8 pin;
- struct io_apic_irq_attr irq_attr;
+ int polarity;
- pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
+ polarity = 0; /* active high */
+ else
+ polarity = 1; /* active low */
/*
* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
* IOAPIC RTE entries, so we just enable RTE for the device.
*/
- irq_attr.ioapic = mp_find_ioapic(dev->irq);
- irq_attr.ioapic_pin = dev->irq;
- irq_attr.trigger = 1; /* level */
- if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
- irq_attr.polarity = 0; /* active high */
- else
- irq_attr.polarity = 1; /* active low */
- io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
+ if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev)))
+ return -EBUSY;
+ if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
+ return -EBUSY;
return 0;
}
+static void intel_mid_pci_irq_disable(struct pci_dev *dev)
+{
+ if (!mp_should_keep_irq(&dev->dev) && dev->irq > 0)
+ mp_unmap_irq(dev->irq);
+}
+
struct pci_ops intel_mid_pci_ops = {
.read = pci_read,
.write = pci_write,
@@ -245,6 +249,7 @@ int __init intel_mid_pci_init(void)
pr_info("Intel MID platform detected, using MID PCI ops\n");
pci_mmcfg_late_init();
pcibios_enable_irq = intel_mid_pci_irq_enable;
+ pcibios_disable_irq = intel_mid_pci_irq_disable;
pci_root_ops = intel_mid_pci_ops;
pci_soc_mode = 1;
/* Continue with standard init */
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 84112f55dd7a..eb500c2592ad 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -26,6 +26,7 @@ static int acer_tm360_irqrouting;
static struct irq_routing_table *pirq_table;
static int pirq_enable_irq(struct pci_dev *dev);
+static void pirq_disable_irq(struct pci_dev *dev);
/*
* Never use: 0, 1, 2 (timer, keyboard, and cascade)
@@ -53,7 +54,7 @@ struct irq_router_handler {
};
int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
-void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
+void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq;
/*
* Check passed address for the PCI IRQ Routing Table signature
@@ -1186,7 +1187,7 @@ void pcibios_penalize_isa_irq(int irq, int active)
static int pirq_enable_irq(struct pci_dev *dev)
{
- u8 pin;
+ u8 pin = 0;
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (pin && !pcibios_lookup_irq(dev, 1)) {
@@ -1227,8 +1228,6 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
dev = temp_dev;
if (irq >= 0) {
- io_apic_set_pci_routing(&dev->dev, irq,
- &irq_attr);
dev->irq = irq;
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
"INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
@@ -1254,3 +1253,12 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
return 0;
}
+
+static void pirq_disable_irq(struct pci_dev *dev)
+{
+ if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
+ dev->irq) {
+ mp_unmap_irq(dev->irq);
+ dev->irq = 0;
+ }
+}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 905956f16465..093f5f4272d3 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -23,6 +23,7 @@
#include <xen/features.h>
#include <xen/events.h>
#include <asm/xen/pci.h>
+#include <asm/i8259.h>
static int xen_pcifront_enable_irq(struct pci_dev *dev)
{
@@ -40,7 +41,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
/* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
pirq = gsi;
- if (gsi < NR_IRQS_LEGACY)
+ if (gsi < nr_legacy_irqs())
share = 0;
rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
@@ -511,7 +512,7 @@ int __init pci_xen_initial_domain(void)
xen_setup_acpi_sci();
__acpi_register_gsi = acpi_register_gsi_xen;
/* Pre-allocate legacy irqs */
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+ for (irq = 0; irq < nr_legacy_irqs(); irq++) {
int trigger, polarity;
if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
@@ -522,7 +523,7 @@ int __init pci_xen_initial_domain(void)
true /* Map GSI to PIRQ */);
}
if (0 == nr_ioapics) {
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+ for (irq = 0; irq < nr_legacy_irqs(); irq++)
xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
}
return 0;
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index 8244f5ec2f4c..701fd5843c87 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -135,14 +135,10 @@ static void __init sdv_arch_setup(void)
sdv_serial_fixup();
}
-#ifdef CONFIG_X86_IO_APIC
static void sdv_pci_init(void)
{
x86_of_pci_init();
- /* We can't set this earlier, because we need to calibrate the timer */
- legacy_pic = &null_legacy_pic;
}
-#endif
/*
* CE4100 specific x86_init function overrides and early setup
@@ -155,7 +151,9 @@ void __init x86_ce4100_early_setup(void)
x86_init.resources.probe_roms = x86_init_noop;
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
x86_init.pci.init = ce4100_pci_init;
+ x86_init.pci.init_irq = sdv_pci_init;
/*
* By default, the reboot method is ACPI which is supported by the
@@ -166,10 +164,5 @@ void __init x86_ce4100_early_setup(void)
*/
reboot_type = BOOT_KBD;
-#ifdef CONFIG_X86_IO_APIC
- x86_init.pci.init_irq = sdv_pci_init;
- x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
-#endif
-
pm_power_off = ce4100_power_off;
}
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index d51045afcaaf..2846aaab5103 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 87fc96bcc13c..850da94fef30 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -56,13 +56,6 @@
#define EFI_DEBUG
-#define EFI_MIN_RESERVE 5120
-
-#define EFI_DUMMY_GUID \
- EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
-
-static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
-
struct efi_memory_map memmap;
static struct efi efi_phys __initdata;
@@ -95,139 +88,6 @@ static int __init setup_add_efi_memmap(char *arg)
}
early_param("add_efi_memmap", setup_add_efi_memmap);
-static bool efi_no_storage_paranoia;
-
-static int __init setup_storage_paranoia(char *arg)
-{
- efi_no_storage_paranoia = true;
- return 0;
-}
-early_param("efi_no_storage_paranoia", setup_storage_paranoia);
-
-static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt(get_time, tm, tc);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_set_time(efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt(set_time, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
- efi_bool_t *pending,
- efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt(get_wakeup_time, enabled, pending, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt(set_wakeup_time, enabled, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_get_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- u32 *attr,
- unsigned long *data_size,
- void *data)
-{
- return efi_call_virt(get_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
- efi_char16_t *name,
- efi_guid_t *vendor)
-{
- return efi_call_virt(get_next_variable,
- name_size, name, vendor);
-}
-
-static efi_status_t virt_efi_set_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- u32 attr,
- unsigned long data_size,
- void *data)
-{
- return efi_call_virt(set_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_query_variable_info(u32 attr,
- u64 *storage_space,
- u64 *remaining_space,
- u64 *max_variable_size)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt(query_variable_info, attr, storage_space,
- remaining_space, max_variable_size);
-}
-
-static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
-{
- return efi_call_virt(get_next_high_mono_count, count);
-}
-
-static void virt_efi_reset_system(int reset_type,
- efi_status_t status,
- unsigned long data_size,
- efi_char16_t *data)
-{
- __efi_call_virt(reset_system, reset_type, status,
- data_size, data);
-}
-
-static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
- unsigned long count,
- unsigned long sg_list)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt(update_capsule, capsules, count, sg_list);
-}
-
-static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
- unsigned long count,
- u64 *max_size,
- int *reset_type)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt(query_capsule_caps, capsules, count, max_size,
- reset_type);
-}
-
static efi_status_t __init phys_efi_set_virtual_address_map(
unsigned long memory_map_size,
unsigned long descriptor_size,
@@ -244,42 +104,6 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
return status;
}
-int efi_set_rtc_mmss(const struct timespec *now)
-{
- unsigned long nowtime = now->tv_sec;
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
- struct rtc_time tm;
-
- status = efi.get_time(&eft, &cap);
- if (status != EFI_SUCCESS) {
- pr_err("Oops: efitime: can't read time!\n");
- return -1;
- }
-
- rtc_time_to_tm(nowtime, &tm);
- if (!rtc_valid_tm(&tm)) {
- eft.year = tm.tm_year + 1900;
- eft.month = tm.tm_mon + 1;
- eft.day = tm.tm_mday;
- eft.minute = tm.tm_min;
- eft.second = tm.tm_sec;
- eft.nanosecond = 0;
- } else {
- pr_err("%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n",
- __func__, nowtime);
- return -1;
- }
-
- status = efi.set_time(&eft);
- if (status != EFI_SUCCESS) {
- pr_err("Oops: efitime: can't write time!\n");
- return -1;
- }
- return 0;
-}
-
void efi_get_time(struct timespec *now)
{
efi_status_t status;
@@ -350,6 +174,9 @@ int __init efi_memblock_x86_reserve_range(void)
struct efi_info *e = &boot_params.efi_info;
unsigned long pmap;
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
#ifdef CONFIG_X86_32
/* Can't handle data above 4GB at this time */
if (e->efi_memmap_hi) {
@@ -392,69 +219,15 @@ static void __init print_efi_memmap(void)
#endif /* EFI_DEBUG */
}
-void __init efi_reserve_boot_services(void)
-{
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- u64 start = md->phys_addr;
- u64 size = md->num_pages << EFI_PAGE_SHIFT;
-
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
- continue;
- /* Only reserve where possible:
- * - Not within any already allocated areas
- * - Not over any memory area (really needed, if above?)
- * - Not within any part of the kernel
- * - Not the bios reserved area
- */
- if ((start + size > __pa_symbol(_text)
- && start <= __pa_symbol(_end)) ||
- !e820_all_mapped(start, start+size, E820_RAM) ||
- memblock_is_region_reserved(start, size)) {
- /* Could not reserve, skip it */
- md->num_pages = 0;
- memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
- start, start+size-1);
- } else
- memblock_reserve(start, size);
- }
-}
-
void __init efi_unmap_memmap(void)
{
clear_bit(EFI_MEMMAP, &efi.flags);
if (memmap.map) {
- early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
+ early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size);
memmap.map = NULL;
}
}
-void __init efi_free_boot_services(void)
-{
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- unsigned long long start = md->phys_addr;
- unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
-
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
- continue;
-
- /* Could not reserve boot area */
- if (!size)
- continue;
-
- free_bootmem_late(start, size);
- }
-
- efi_unmap_memmap();
-}
-
static int __init efi_systab_init(void *phys)
{
if (efi_enabled(EFI_64BIT)) {
@@ -467,12 +240,12 @@ static int __init efi_systab_init(void *phys)
if (!data)
return -ENOMEM;
}
- systab64 = early_ioremap((unsigned long)phys,
+ systab64 = early_memremap((unsigned long)phys,
sizeof(*systab64));
if (systab64 == NULL) {
pr_err("Couldn't map the system table!\n");
if (data)
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
return -ENOMEM;
}
@@ -504,9 +277,9 @@ static int __init efi_systab_init(void *phys)
systab64->tables;
tmp |= data ? data->tables : systab64->tables;
- early_iounmap(systab64, sizeof(*systab64));
+ early_memunmap(systab64, sizeof(*systab64));
if (data)
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
#ifdef CONFIG_X86_32
if (tmp >> 32) {
pr_err("EFI data located above 4GB, disabling EFI.\n");
@@ -516,7 +289,7 @@ static int __init efi_systab_init(void *phys)
} else {
efi_system_table_32_t *systab32;
- systab32 = early_ioremap((unsigned long)phys,
+ systab32 = early_memremap((unsigned long)phys,
sizeof(*systab32));
if (systab32 == NULL) {
pr_err("Couldn't map the system table!\n");
@@ -537,7 +310,7 @@ static int __init efi_systab_init(void *phys)
efi_systab.nr_tables = systab32->nr_tables;
efi_systab.tables = systab32->tables;
- early_iounmap(systab32, sizeof(*systab32));
+ early_memunmap(systab32, sizeof(*systab32));
}
efi.systab = &efi_systab;
@@ -563,7 +336,7 @@ static int __init efi_runtime_init32(void)
{
efi_runtime_services_32_t *runtime;
- runtime = early_ioremap((unsigned long)efi.systab->runtime,
+ runtime = early_memremap((unsigned long)efi.systab->runtime,
sizeof(efi_runtime_services_32_t));
if (!runtime) {
pr_err("Could not map the runtime service table!\n");
@@ -578,7 +351,7 @@ static int __init efi_runtime_init32(void)
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
(unsigned long)runtime->set_virtual_address_map;
- early_iounmap(runtime, sizeof(efi_runtime_services_32_t));
+ early_memunmap(runtime, sizeof(efi_runtime_services_32_t));
return 0;
}
@@ -587,7 +360,7 @@ static int __init efi_runtime_init64(void)
{
efi_runtime_services_64_t *runtime;
- runtime = early_ioremap((unsigned long)efi.systab->runtime,
+ runtime = early_memremap((unsigned long)efi.systab->runtime,
sizeof(efi_runtime_services_64_t));
if (!runtime) {
pr_err("Could not map the runtime service table!\n");
@@ -602,7 +375,7 @@ static int __init efi_runtime_init64(void)
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
(unsigned long)runtime->set_virtual_address_map;
- early_iounmap(runtime, sizeof(efi_runtime_services_64_t));
+ early_memunmap(runtime, sizeof(efi_runtime_services_64_t));
return 0;
}
@@ -616,14 +389,24 @@ static int __init efi_runtime_init(void)
* the runtime services table so that we can grab the physical
* address of several of the EFI runtime functions, needed to
* set the firmware into virtual mode.
+ *
+ * When EFI_PARAVIRT is in force then we could not map runtime
+ * service memory region because we do not have direct access to it.
+ * However, runtime services are available through proxy functions
+ * (e.g. in case of Xen dom0 EFI implementation they call special
+ * hypercall which executes relevant EFI functions) and that is why
+ * they are always enabled.
*/
- if (efi_enabled(EFI_64BIT))
- rv = efi_runtime_init64();
- else
- rv = efi_runtime_init32();
- if (rv)
- return rv;
+ if (!efi_enabled(EFI_PARAVIRT)) {
+ if (efi_enabled(EFI_64BIT))
+ rv = efi_runtime_init64();
+ else
+ rv = efi_runtime_init32();
+
+ if (rv)
+ return rv;
+ }
set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
@@ -632,8 +415,11 @@ static int __init efi_runtime_init(void)
static int __init efi_memmap_init(void)
{
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
/* Map the EFI memory map */
- memmap.map = early_ioremap((unsigned long)memmap.phys_map,
+ memmap.map = early_memremap((unsigned long)memmap.phys_map,
memmap.nr_map * memmap.desc_size);
if (memmap.map == NULL) {
pr_err("Could not map the memory map!\n");
@@ -649,62 +435,6 @@ static int __init efi_memmap_init(void)
return 0;
}
-/*
- * A number of config table entries get remapped to virtual addresses
- * after entering EFI virtual mode. However, the kexec kernel requires
- * their physical addresses therefore we pass them via setup_data and
- * correct those entries to their respective physical addresses here.
- *
- * Currently only handles smbios which is necessary for some firmware
- * implementation.
- */
-static int __init efi_reuse_config(u64 tables, int nr_tables)
-{
- int i, sz, ret = 0;
- void *p, *tablep;
- struct efi_setup_data *data;
-
- if (!efi_setup)
- return 0;
-
- if (!efi_enabled(EFI_64BIT))
- return 0;
-
- data = early_memremap(efi_setup, sizeof(*data));
- if (!data) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (!data->smbios)
- goto out_memremap;
-
- sz = sizeof(efi_config_table_64_t);
-
- p = tablep = early_memremap(tables, nr_tables * sz);
- if (!p) {
- pr_err("Could not map Configuration table!\n");
- ret = -ENOMEM;
- goto out_memremap;
- }
-
- for (i = 0; i < efi.systab->nr_tables; i++) {
- efi_guid_t guid;
-
- guid = ((efi_config_table_64_t *)p)->guid;
-
- if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
- ((efi_config_table_64_t *)p)->table = data->smbios;
- p += sz;
- }
- early_iounmap(tablep, nr_tables * sz);
-
-out_memremap:
- early_iounmap(data, sizeof(*data));
-out:
- return ret;
-}
-
void __init efi_init(void)
{
efi_char16_t *c16;
@@ -728,8 +458,6 @@ void __init efi_init(void)
if (efi_systab_init(efi_phys.systab))
return;
- set_bit(EFI_SYSTEM_TABLES, &efi.flags);
-
efi.config_table = (unsigned long)efi.systab->tables;
efi.fw_vendor = (unsigned long)efi.systab->fw_vendor;
efi.runtime = (unsigned long)efi.systab->runtime;
@@ -737,14 +465,14 @@ void __init efi_init(void)
/*
* Show what we know for posterity
*/
- c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
+ c16 = tmp = early_memremap(efi.systab->fw_vendor, 2);
if (c16) {
for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
vendor[i] = *c16++;
vendor[i] = '\0';
} else
pr_err("Could not map the firmware vendor!\n");
- early_iounmap(tmp, 2);
+ early_memunmap(tmp, 2);
pr_info("EFI v%u.%.02u by %s\n",
efi.systab->hdr.revision >> 16,
@@ -770,8 +498,6 @@ void __init efi_init(void)
if (efi_memmap_init())
return;
- set_bit(EFI_MEMMAP, &efi.flags);
-
print_efi_memmap();
}
@@ -847,22 +573,6 @@ void __init old_map_region(efi_memory_desc_t *md)
(unsigned long long)md->phys_addr);
}
-static void native_runtime_setup(void)
-{
- efi.get_time = virt_efi_get_time;
- efi.set_time = virt_efi_set_time;
- efi.get_wakeup_time = virt_efi_get_wakeup_time;
- efi.set_wakeup_time = virt_efi_set_wakeup_time;
- efi.get_variable = virt_efi_get_variable;
- efi.get_next_variable = virt_efi_get_next_variable;
- efi.set_variable = virt_efi_set_variable;
- efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
- efi.reset_system = virt_efi_reset_system;
- efi.query_variable_info = virt_efi_query_variable_info;
- efi.update_capsule = virt_efi_update_capsule;
- efi.query_capsule_caps = virt_efi_query_capsule_caps;
-}
-
/* Merge contiguous regions of the same type and attribute */
static void __init efi_merge_regions(void)
{
@@ -1049,7 +759,7 @@ static void __init kexec_enter_virtual_mode(void)
*/
efi.runtime_version = efi_systab.hdr.revision;
- native_runtime_setup();
+ efi_native_runtime_setup();
efi.set_virtual_address_map = NULL;
@@ -1057,11 +767,7 @@ static void __init kexec_enter_virtual_mode(void)
runtime_code_page_mkexec();
/* clean DUMMY object */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, NULL);
+ efi_delete_dummy_variable();
#endif
}
@@ -1142,7 +848,7 @@ static void __init __efi_enter_virtual_mode(void)
efi.runtime_version = efi_systab.hdr.revision;
if (efi_is_native())
- native_runtime_setup();
+ efi_native_runtime_setup();
else
efi_thunk_runtime_setup();
@@ -1179,15 +885,14 @@ static void __init __efi_enter_virtual_mode(void)
free_pages((unsigned long)new_memmap, pg_shift);
/* clean DUMMY object */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, NULL);
+ efi_delete_dummy_variable();
}
void __init efi_enter_virtual_mode(void)
{
+ if (efi_enabled(EFI_PARAVIRT))
+ return;
+
if (efi_setup)
kexec_enter_virtual_mode();
else
@@ -1220,6 +925,9 @@ u64 efi_mem_attributes(unsigned long phys_addr)
efi_memory_desc_t *md;
void *p;
+ if (!efi_enabled(EFI_MEMMAP))
+ return 0;
+
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
if ((md->phys_addr <= phys_addr) &&
@@ -1230,86 +938,6 @@ u64 efi_mem_attributes(unsigned long phys_addr)
return 0;
}
-/*
- * Some firmware implementations refuse to boot if there's insufficient space
- * in the variable store. Ensure that we never use more than a safe limit.
- *
- * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
- * store.
- */
-efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
-{
- efi_status_t status;
- u64 storage_size, remaining_size, max_size;
-
- if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
- return 0;
-
- status = efi.query_variable_info(attributes, &storage_size,
- &remaining_size, &max_size);
- if (status != EFI_SUCCESS)
- return status;
-
- /*
- * We account for that by refusing the write if permitting it would
- * reduce the available space to under 5KB. This figure was provided by
- * Samsung, so should be safe.
- */
- if ((remaining_size - size < EFI_MIN_RESERVE) &&
- !efi_no_storage_paranoia) {
-
- /*
- * Triggering garbage collection may require that the firmware
- * generate a real EFI_OUT_OF_RESOURCES error. We can force
- * that by attempting to use more space than is available.
- */
- unsigned long dummy_size = remaining_size + 1024;
- void *dummy = kzalloc(dummy_size, GFP_ATOMIC);
-
- if (!dummy)
- return EFI_OUT_OF_RESOURCES;
-
- status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- dummy_size, dummy);
-
- if (status == EFI_SUCCESS) {
- /*
- * This should have failed, so if it didn't make sure
- * that we delete it...
- */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, dummy);
- }
-
- kfree(dummy);
-
- /*
- * The runtime code may now have triggered a garbage collection
- * run, so check the variable info again
- */
- status = efi.query_variable_info(attributes, &storage_size,
- &remaining_size, &max_size);
-
- if (status != EFI_SUCCESS)
- return status;
-
- /*
- * There still isn't enough room, so return an error
- */
- if (remaining_size - size < EFI_MIN_RESERVE)
- return EFI_OUT_OF_RESOURCES;
- }
-
- return EFI_SUCCESS;
-}
-EXPORT_SYMBOL_GPL(efi_query_variable_store);
-
static int __init parse_efi_cmdline(char *str)
{
if (*str == '=')
@@ -1321,22 +949,3 @@ static int __init parse_efi_cmdline(char *str)
return 0;
}
early_param("efi", parse_efi_cmdline);
-
-void __init efi_apply_memmap_quirks(void)
-{
- /*
- * Once setup is done earlier, unmap the EFI memory map on mismatched
- * firmware/kernel architectures since there is no support for runtime
- * services.
- */
- if (!efi_runtime_supported()) {
- pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
- efi_unmap_memmap();
- }
-
- /*
- * UV doesn't support the new EFI pagetable mapping yet.
- */
- if (is_uv_system())
- set_bit(EFI_OLD_MEMMAP, &efi.flags);
-}
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
new file mode 100644
index 000000000000..1c7380da65ff
--- /dev/null
+++ b/arch/x86/platform/efi/quirks.c
@@ -0,0 +1,290 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/efi.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <asm/efi.h>
+#include <asm/uv/uv.h>
+
+#define EFI_MIN_RESERVE 5120
+
+#define EFI_DUMMY_GUID \
+ EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
+
+static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
+
+static bool efi_no_storage_paranoia;
+
+/*
+ * Some firmware implementations refuse to boot if there's insufficient
+ * space in the variable store. The implementation of garbage collection
+ * in some FW versions causes stale (deleted) variables to take up space
+ * longer than intended and space is only freed once the store becomes
+ * almost completely full.
+ *
+ * Enabling this option disables the space checks in
+ * efi_query_variable_store() and forces garbage collection.
+ *
+ * Only enable this option if deleting EFI variables does not free up
+ * space in your variable store, e.g. if despite deleting variables
+ * you're unable to create new ones.
+ */
+static int __init setup_storage_paranoia(char *arg)
+{
+ efi_no_storage_paranoia = true;
+ return 0;
+}
+early_param("efi_no_storage_paranoia", setup_storage_paranoia);
+
+/*
+ * Deleting the dummy variable which kicks off garbage collection
+*/
+void efi_delete_dummy_variable(void)
+{
+ efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ 0, NULL);
+}
+
+/*
+ * Some firmware implementations refuse to boot if there's insufficient space
+ * in the variable store. Ensure that we never use more than a safe limit.
+ *
+ * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
+ * store.
+ */
+efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
+{
+ efi_status_t status;
+ u64 storage_size, remaining_size, max_size;
+
+ if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
+ return 0;
+
+ status = efi.query_variable_info(attributes, &storage_size,
+ &remaining_size, &max_size);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /*
+ * We account for that by refusing the write if permitting it would
+ * reduce the available space to under 5KB. This figure was provided by
+ * Samsung, so should be safe.
+ */
+ if ((remaining_size - size < EFI_MIN_RESERVE) &&
+ !efi_no_storage_paranoia) {
+
+ /*
+ * Triggering garbage collection may require that the firmware
+ * generate a real EFI_OUT_OF_RESOURCES error. We can force
+ * that by attempting to use more space than is available.
+ */
+ unsigned long dummy_size = remaining_size + 1024;
+ void *dummy = kzalloc(dummy_size, GFP_ATOMIC);
+
+ if (!dummy)
+ return EFI_OUT_OF_RESOURCES;
+
+ status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ dummy_size, dummy);
+
+ if (status == EFI_SUCCESS) {
+ /*
+ * This should have failed, so if it didn't make sure
+ * that we delete it...
+ */
+ efi_delete_dummy_variable();
+ }
+
+ kfree(dummy);
+
+ /*
+ * The runtime code may now have triggered a garbage collection
+ * run, so check the variable info again
+ */
+ status = efi.query_variable_info(attributes, &storage_size,
+ &remaining_size, &max_size);
+
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /*
+ * There still isn't enough room, so return an error
+ */
+ if (remaining_size - size < EFI_MIN_RESERVE)
+ return EFI_OUT_OF_RESOURCES;
+ }
+
+ return EFI_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(efi_query_variable_store);
+
+/*
+ * The UEFI specification makes it clear that the operating system is free to do
+ * whatever it wants with boot services code after ExitBootServices() has been
+ * called. Ignoring this recommendation a significant bunch of EFI implementations
+ * continue calling into boot services code (SetVirtualAddressMap). In order to
+ * work around such buggy implementations we reserve boot services region during
+ * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it
+* is discarded.
+*/
+void __init efi_reserve_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ u64 start = md->phys_addr;
+ u64 size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+ /* Only reserve where possible:
+ * - Not within any already allocated areas
+ * - Not over any memory area (really needed, if above?)
+ * - Not within any part of the kernel
+ * - Not the bios reserved area
+ */
+ if ((start + size > __pa_symbol(_text)
+ && start <= __pa_symbol(_end)) ||
+ !e820_all_mapped(start, start+size, E820_RAM) ||
+ memblock_is_region_reserved(start, size)) {
+ /* Could not reserve, skip it */
+ md->num_pages = 0;
+ memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
+ start, start+size-1);
+ } else
+ memblock_reserve(start, size);
+ }
+}
+
+void __init efi_free_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+
+ /* Could not reserve boot area */
+ if (!size)
+ continue;
+
+ free_bootmem_late(start, size);
+ }
+
+ efi_unmap_memmap();
+}
+
+/*
+ * A number of config table entries get remapped to virtual addresses
+ * after entering EFI virtual mode. However, the kexec kernel requires
+ * their physical addresses therefore we pass them via setup_data and
+ * correct those entries to their respective physical addresses here.
+ *
+ * Currently only handles smbios which is necessary for some firmware
+ * implementation.
+ */
+int __init efi_reuse_config(u64 tables, int nr_tables)
+{
+ int i, sz, ret = 0;
+ void *p, *tablep;
+ struct efi_setup_data *data;
+
+ if (!efi_setup)
+ return 0;
+
+ if (!efi_enabled(EFI_64BIT))
+ return 0;
+
+ data = early_memremap(efi_setup, sizeof(*data));
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!data->smbios)
+ goto out_memremap;
+
+ sz = sizeof(efi_config_table_64_t);
+
+ p = tablep = early_memremap(tables, nr_tables * sz);
+ if (!p) {
+ pr_err("Could not map Configuration table!\n");
+ ret = -ENOMEM;
+ goto out_memremap;
+ }
+
+ for (i = 0; i < efi.systab->nr_tables; i++) {
+ efi_guid_t guid;
+
+ guid = ((efi_config_table_64_t *)p)->guid;
+
+ if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
+ ((efi_config_table_64_t *)p)->table = data->smbios;
+ p += sz;
+ }
+ early_memunmap(tablep, nr_tables * sz);
+
+out_memremap:
+ early_memunmap(data, sizeof(*data));
+out:
+ return ret;
+}
+
+void __init efi_apply_memmap_quirks(void)
+{
+ /*
+ * Once setup is done earlier, unmap the EFI memory map on mismatched
+ * firmware/kernel architectures since there is no support for runtime
+ * services.
+ */
+ if (!efi_runtime_supported()) {
+ pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
+ efi_unmap_memmap();
+ }
+
+ /*
+ * UV doesn't support the new EFI pagetable mapping yet.
+ */
+ if (is_uv_system())
+ set_bit(EFI_OLD_MEMMAP, &efi.flags);
+}
+
+/*
+ * For most modern platforms the preferred method of powering off is via
+ * ACPI. However, there are some that are known to require the use of
+ * EFI runtime services and for which ACPI does not work at all.
+ *
+ * Using EFI is a last resort, to be used only if no other option
+ * exists.
+ */
+bool efi_reboot_required(void)
+{
+ if (!acpi_gbl_reduced_hardware)
+ return false;
+
+ efi_reboot_quirk_mode = EFI_RESET_WARM;
+ return true;
+}
+
+bool efi_poweroff_required(void)
+{
+ return !!acpi_gbl_reduced_hardware;
+}
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
index 973cf3bfa9fd..0b283d4d0ad7 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
@@ -26,28 +26,18 @@ static struct platform_device wdt_dev = {
static int tangier_probe(struct platform_device *pdev)
{
- int ioapic;
- int irq;
+ int gsi;
struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
- struct io_apic_irq_attr irq_attr = { 0 };
if (!pdata)
return -EINVAL;
- irq = pdata->irq;
- ioapic = mp_find_ioapic(irq);
- if (ioapic >= 0) {
- int ret;
- irq_attr.ioapic = ioapic;
- irq_attr.ioapic_pin = irq;
- irq_attr.trigger = 1;
- /* irq_attr.polarity = 0; -> Active high */
- ret = io_apic_set_pci_routing(NULL, irq, &irq_attr);
- if (ret)
- return ret;
- } else {
+ /* IOAPIC builds identity mapping between GSI and IRQ on MID */
+ gsi = pdata->irq;
+ if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) ||
+ mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) {
dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
- irq);
+ gsi);
return -EINVAL;
}
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index 994c40bd7cb7..3c53a90fdb18 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -432,9 +432,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
struct sfi_table_simple *sb;
struct sfi_device_table_entry *pentry;
struct devs_id *dev = NULL;
- int num, i;
- int ioapic;
- struct io_apic_irq_attr irq_attr;
+ int num, i, ret;
+ int polarity;
sb = (struct sfi_table_simple *)table;
num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
@@ -448,35 +447,30 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
* devices, but they have separate RTE entry in IOAPIC
* so we have to enable them one by one here
*/
- ioapic = mp_find_ioapic(irq);
- if (ioapic >= 0) {
- irq_attr.ioapic = ioapic;
- irq_attr.ioapic_pin = irq;
- irq_attr.trigger = 1;
- if (intel_mid_identify_cpu() ==
- INTEL_MID_CPU_CHIP_TANGIER) {
- if (!strncmp(pentry->name,
- "r69001-ts-i2c", 13))
- /* active low */
- irq_attr.polarity = 1;
- else if (!strncmp(pentry->name,
- "synaptics_3202", 14))
- /* active low */
- irq_attr.polarity = 1;
- else if (irq == 41)
- /* fast_int_1 */
- irq_attr.polarity = 1;
- else
- /* active high */
- irq_attr.polarity = 0;
- } else {
- /* PNW and CLV go with active low */
- irq_attr.polarity = 1;
- }
- io_apic_set_pci_routing(NULL, irq, &irq_attr);
+ if (intel_mid_identify_cpu() ==
+ INTEL_MID_CPU_CHIP_TANGIER) {
+ if (!strncmp(pentry->name, "r69001-ts-i2c", 13))
+ /* active low */
+ polarity = 1;
+ else if (!strncmp(pentry->name,
+ "synaptics_3202", 14))
+ /* active low */
+ polarity = 1;
+ else if (irq == 41)
+ /* fast_int_1 */
+ polarity = 1;
+ else
+ /* active high */
+ polarity = 0;
+ } else {
+ /* PNW and CLV go with active low */
+ polarity = 1;
}
- } else {
- irq = 0; /* No irq */
+
+ ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE);
+ if (ret == 0)
+ ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC);
+ WARN_ON(ret < 0);
}
dev = get_device_id(pentry->type, pentry->name);
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index bcd1a703e3e6..2a8a74f3bd76 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/sfi.h>
#include <linux/io.h>
+#include <linux/irqdomain.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
@@ -70,19 +71,26 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
+static struct irq_domain_ops sfi_ioapic_irqdomain_ops = {
+ .map = mp_irqdomain_map,
+};
static int __init sfi_parse_ioapic(struct sfi_table_header *table)
{
struct sfi_table_simple *sb;
struct sfi_apic_table_entry *pentry;
int i, num;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_STRICT,
+ .ops = &sfi_ioapic_irqdomain_ops,
+ };
sb = (struct sfi_table_simple *)table;
num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
pentry = (struct sfi_apic_table_entry *)sb->pentry;
for (i = 0; i < num; i++) {
- mp_register_ioapic(i, pentry->phys_addr, gsi_top);
+ mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg);
pentry++;
}
diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c
index 9471b9456f25..baf16e72e668 100644
--- a/arch/x86/platform/ts5500/ts5500.c
+++ b/arch/x86/platform/ts5500/ts5500.c
@@ -1,7 +1,7 @@
/*
* Technologic Systems TS-5500 Single Board Computer support
*
- * Copyright (C) 2013 Savoir-faire Linux Inc.
+ * Copyright (C) 2013-2014 Savoir-faire Linux Inc.
* Vivien Didelot <vivien.didelot@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify it under
@@ -15,8 +15,8 @@
* state or available options. For further information about sysfs entries, see
* Documentation/ABI/testing/sysfs-platform-ts5500.
*
- * This code actually supports the TS-5500 platform, but it may be extended to
- * support similar Technologic Systems x86-based platforms, such as the TS-5600.
+ * This code may be extended to support similar x86-based platforms.
+ * Actually, the TS-5500 and TS-5400 are supported.
*/
#include <linux/delay.h>
@@ -32,6 +32,7 @@
/* Product code register */
#define TS5500_PRODUCT_CODE_ADDR 0x74
#define TS5500_PRODUCT_CODE 0x60 /* TS-5500 product code */
+#define TS5400_PRODUCT_CODE 0x40 /* TS-5400 product code */
/* SRAM/RS-485/ADC options, and RS-485 RTS/Automatic RS-485 flags register */
#define TS5500_SRAM_RS485_ADC_ADDR 0x75
@@ -66,6 +67,7 @@
/**
* struct ts5500_sbc - TS-5500 board description
+ * @name: Board model name.
* @id: Board product ID.
* @sram: Flag for SRAM option.
* @rs485: Flag for RS-485 option.
@@ -75,6 +77,7 @@
* @jumpers: Bitfield for jumpers' state.
*/
struct ts5500_sbc {
+ const char *name;
int id;
bool sram;
bool rs485;
@@ -122,13 +125,16 @@ static int __init ts5500_detect_config(struct ts5500_sbc *sbc)
if (!request_region(TS5500_PRODUCT_CODE_ADDR, 4, "ts5500"))
return -EBUSY;
- tmp = inb(TS5500_PRODUCT_CODE_ADDR);
- if (tmp != TS5500_PRODUCT_CODE) {
- pr_err("This platform is not a TS-5500 (found ID 0x%x)\n", tmp);
+ sbc->id = inb(TS5500_PRODUCT_CODE_ADDR);
+ if (sbc->id == TS5500_PRODUCT_CODE) {
+ sbc->name = "TS-5500";
+ } else if (sbc->id == TS5400_PRODUCT_CODE) {
+ sbc->name = "TS-5400";
+ } else {
+ pr_err("ts5500: unknown product code 0x%x\n", sbc->id);
ret = -ENODEV;
goto cleanup;
}
- sbc->id = tmp;
tmp = inb(TS5500_SRAM_RS485_ADC_ADDR);
sbc->sram = tmp & TS5500_SRAM;
@@ -147,48 +153,52 @@ cleanup:
return ret;
}
-static ssize_t ts5500_show_id(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t name_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct ts5500_sbc *sbc = dev_get_drvdata(dev);
- return sprintf(buf, "0x%.2x\n", sbc->id);
+ return sprintf(buf, "%s\n", sbc->name);
}
+static DEVICE_ATTR_RO(name);
-static ssize_t ts5500_show_jumpers(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t id_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct ts5500_sbc *sbc = dev_get_drvdata(dev);
- return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1);
+ return sprintf(buf, "0x%.2x\n", sbc->id);
}
+static DEVICE_ATTR_RO(id);
-#define TS5500_SHOW(field) \
- static ssize_t ts5500_show_##field(struct device *dev, \
- struct device_attribute *attr, \
- char *buf) \
- { \
- struct ts5500_sbc *sbc = dev_get_drvdata(dev); \
- return sprintf(buf, "%d\n", sbc->field); \
- }
-
-TS5500_SHOW(sram)
-TS5500_SHOW(rs485)
-TS5500_SHOW(adc)
-TS5500_SHOW(ereset)
-TS5500_SHOW(itr)
+static ssize_t jumpers_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev);
-static DEVICE_ATTR(id, S_IRUGO, ts5500_show_id, NULL);
-static DEVICE_ATTR(jumpers, S_IRUGO, ts5500_show_jumpers, NULL);
-static DEVICE_ATTR(sram, S_IRUGO, ts5500_show_sram, NULL);
-static DEVICE_ATTR(rs485, S_IRUGO, ts5500_show_rs485, NULL);
-static DEVICE_ATTR(adc, S_IRUGO, ts5500_show_adc, NULL);
-static DEVICE_ATTR(ereset, S_IRUGO, ts5500_show_ereset, NULL);
-static DEVICE_ATTR(itr, S_IRUGO, ts5500_show_itr, NULL);
+ return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1);
+}
+static DEVICE_ATTR_RO(jumpers);
+
+#define TS5500_ATTR_BOOL(_field) \
+ static ssize_t _field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+ { \
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev); \
+ \
+ return sprintf(buf, "%d\n", sbc->_field); \
+ } \
+ static DEVICE_ATTR_RO(_field)
+
+TS5500_ATTR_BOOL(sram);
+TS5500_ATTR_BOOL(rs485);
+TS5500_ATTR_BOOL(adc);
+TS5500_ATTR_BOOL(ereset);
+TS5500_ATTR_BOOL(itr);
static struct attribute *ts5500_attributes[] = {
&dev_attr_id.attr,
+ &dev_attr_name.attr,
&dev_attr_jumpers.attr,
&dev_attr_sram.attr,
&dev_attr_rs485.attr,
@@ -311,12 +321,14 @@ static int __init ts5500_init(void)
if (err)
goto error;
- ts5500_dio1_pdev.dev.parent = &pdev->dev;
- if (platform_device_register(&ts5500_dio1_pdev))
- dev_warn(&pdev->dev, "DIO1 block registration failed\n");
- ts5500_dio2_pdev.dev.parent = &pdev->dev;
- if (platform_device_register(&ts5500_dio2_pdev))
- dev_warn(&pdev->dev, "DIO2 block registration failed\n");
+ if (sbc->id == TS5500_PRODUCT_CODE) {
+ ts5500_dio1_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio1_pdev))
+ dev_warn(&pdev->dev, "DIO1 block registration failed\n");
+ ts5500_dio2_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio2_pdev))
+ dev_warn(&pdev->dev, "DIO2 block registration failed\n");
+ }
if (led_classdev_register(&pdev->dev, &ts5500_led_cdev))
dev_warn(&pdev->dev, "LED registration failed\n");
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index dfe605ac1bcd..3968d67d366b 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
/*
* SGI UltraViolet TLB flush routines.
*
- * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
+ * (c) 2008-2014 Cliff Wickman <cpw@sgi.com>, SGI.
*
* This code is released under the GNU General Public License version 2 or
* later.
@@ -451,7 +451,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
/*
* The reverse of the above; converts a duration in ns to a duration in cycles.
- */
+ */
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
struct cyc2ns_data *data = cyc2ns_read_begin();
@@ -563,7 +563,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
* UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
* But not currently used.
*/
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
+static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc)
{
unsigned long descriptor_status;
@@ -606,7 +606,7 @@ int handle_uv2_busy(struct bau_control *bcp)
return FLUSH_GIVEUP;
}
-static int uv2_wait_completion(struct bau_desc *bau_desc,
+static int uv2_3_wait_completion(struct bau_desc *bau_desc,
unsigned long mmr_offset, int right_shift,
struct bau_control *bcp, long try)
{
@@ -616,7 +616,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
long busy_reps = 0;
struct ptc_stats *stat = bcp->statp;
- descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
+ descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
/* spin on the status MMR, waiting for it to go idle */
while (descriptor_stat != UV2H_DESC_IDLE) {
@@ -658,8 +658,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
/* not to hammer on the clock */
busy_reps = 0;
ttm = get_cycles();
- if ((ttm - bcp->send_message) >
- bcp->timeout_interval)
+ if ((ttm - bcp->send_message) > bcp->timeout_interval)
return handle_uv2_busy(bcp);
}
/*
@@ -667,8 +666,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
*/
cpu_relax();
}
- descriptor_stat = uv2_read_status(mmr_offset, right_shift,
- desc);
+ descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
}
bcp->conseccompletes++;
return FLUSH_COMPLETE;
@@ -679,8 +677,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
* which register to read and position in that register based on cpu in
* current hub.
*/
-static int wait_completion(struct bau_desc *bau_desc,
- struct bau_control *bcp, long try)
+static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try)
{
int right_shift;
unsigned long mmr_offset;
@@ -695,11 +692,9 @@ static int wait_completion(struct bau_desc *bau_desc,
}
if (bcp->uvhub_version == 1)
- return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
- bcp, try);
+ return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
else
- return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
- bcp, try);
+ return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
}
/*
@@ -888,7 +883,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
struct ptc_stats *stat = bcp->statp;
struct bau_control *hmaster = bcp->uvhub_master;
struct uv1_bau_msg_header *uv1_hdr = NULL;
- struct uv2_bau_msg_header *uv2_hdr = NULL;
+ struct uv2_3_bau_msg_header *uv2_3_hdr = NULL;
if (bcp->uvhub_version == 1) {
uv1 = 1;
@@ -902,27 +897,28 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
if (uv1)
uv1_hdr = &bau_desc->header.uv1_hdr;
else
- uv2_hdr = &bau_desc->header.uv2_hdr;
+ /* uv2 and uv3 */
+ uv2_3_hdr = &bau_desc->header.uv2_3_hdr;
do {
if (try == 0) {
if (uv1)
uv1_hdr->msg_type = MSG_REGULAR;
else
- uv2_hdr->msg_type = MSG_REGULAR;
+ uv2_3_hdr->msg_type = MSG_REGULAR;
seq_number = bcp->message_number++;
} else {
if (uv1)
uv1_hdr->msg_type = MSG_RETRY;
else
- uv2_hdr->msg_type = MSG_RETRY;
+ uv2_3_hdr->msg_type = MSG_RETRY;
stat->s_retry_messages++;
}
if (uv1)
uv1_hdr->sequence = seq_number;
else
- uv2_hdr->sequence = seq_number;
+ uv2_3_hdr->sequence = seq_number;
index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
bcp->send_message = get_cycles();
@@ -1080,8 +1076,10 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* done. The returned pointer is valid till preemption is re-enabled.
*/
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned int cpu)
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned int cpu)
{
int locals = 0;
int remotes = 0;
@@ -1268,6 +1266,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
if (bcp->uvhub_version == 2)
process_uv2_message(&msgdesc, bcp);
else
+ /* no error workaround for uv1 or uv3 */
bau_process_message(&msgdesc, bcp, 1);
msg++;
@@ -1325,8 +1324,12 @@ static void __init enable_timeouts(void)
*/
mmr_image |= (1L << SOFTACK_MSHIFT);
if (is_uv2_hub()) {
+ /* do not touch the legacy mode bit */
/* hw bug workaround; do not use extended status */
mmr_image &= ~(1L << UV2_EXT_SHFT);
+ } else if (is_uv3_hub()) {
+ mmr_image &= ~(1L << PREFETCH_HINT_SHFT);
+ mmr_image |= (1L << SB_STATUS_SHFT);
}
write_mmr_misc_control(pnode, mmr_image);
}
@@ -1476,7 +1479,7 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
return count;
}
- if (strict_strtol(optstr, 10, &input_arg) < 0) {
+ if (kstrtol(optstr, 10, &input_arg) < 0) {
printk(KERN_DEBUG "%s is invalid\n", optstr);
return -EINVAL;
}
@@ -1692,7 +1695,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
struct bau_desc *bau_desc;
struct bau_desc *bd2;
struct uv1_bau_msg_header *uv1_hdr;
- struct uv2_bau_msg_header *uv2_hdr;
+ struct uv2_3_bau_msg_header *uv2_3_hdr;
struct bau_control *bcp;
/*
@@ -1739,15 +1742,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
*/
} else {
/*
- * BIOS uses legacy mode, but UV2 hardware always
+ * BIOS uses legacy mode, but uv2 and uv3 hardware always
* uses native mode for selective broadcasts.
*/
- uv2_hdr = &bd2->header.uv2_hdr;
- uv2_hdr->swack_flag = 1;
- uv2_hdr->base_dest_nasid =
+ uv2_3_hdr = &bd2->header.uv2_3_hdr;
+ uv2_3_hdr->swack_flag = 1;
+ uv2_3_hdr->base_dest_nasid =
UV_PNODE_TO_NASID(base_pnode);
- uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID;
- uv2_hdr->command = UV_NET_ENDPOINT_INTD;
+ uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv2_3_hdr->command = UV_NET_ENDPOINT_INTD;
}
}
for_each_present_cpu(cpu) {
@@ -1858,6 +1861,7 @@ static int calculate_destination_timeout(void)
ts_ns *= (mult1 * mult2);
ret = ts_ns / 1000;
} else {
+ /* same destination timeout for uv2 and uv3 */
/* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */
mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
@@ -2012,8 +2016,10 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
bcp->uvhub_version = 1;
else if (is_uv2_hub())
bcp->uvhub_version = 2;
+ else if (is_uv3_hub())
+ bcp->uvhub_version = 3;
else {
- printk(KERN_EMERG "uvhub version not 1 or 2\n");
+ printk(KERN_EMERG "uvhub version not 1, 2 or 3\n");
return 1;
}
bcp->uvhub_master = *hmasterp;
@@ -2138,9 +2144,10 @@ static int __init uv_bau_init(void)
}
vector = UV_BAU_MESSAGE;
- for_each_possible_blade(uvhub)
+ for_each_possible_blade(uvhub) {
if (uv_blade_nr_possible_cpus(uvhub))
init_uvhub(uvhub, vector, uv_base_pnode);
+ }
alloc_intr_gate(vector, uv_bau_message_intr1);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 424f4c97a44d..6ec7910f59bf 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -165,7 +165,7 @@ static void fix_processor_context(void)
* by __save_processor_state()
* @ctxt - structure to load the registers contents from
*/
-static void __restore_processor_state(struct saved_context *ctxt)
+static void notrace __restore_processor_state(struct saved_context *ctxt)
{
if (ctxt->misc_enable_saved)
wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable);
@@ -239,7 +239,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
}
/* Needed by apm.c */
-void restore_processor_state(void)
+void notrace restore_processor_state(void)
{
__restore_processor_state(&saved_context);
}
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
new file mode 100644
index 000000000000..899dd2454256
--- /dev/null
+++ b/arch/x86/purgatory/Makefile
@@ -0,0 +1,28 @@
+purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o
+
+targets += $(purgatory-y)
+PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
+
+LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
+targets += purgatory.ro
+
+# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
+# in turn leaves some undefined symbols like __fentry__ in purgatory and not
+# sure how to relocate those. Like kexec-tools, use custom flags.
+
+KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large
+KBUILD_CFLAGS += -m$(BITS)
+
+$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
+ $(call if_changed,ld)
+
+targets += kexec-purgatory.c
+
+quiet_cmd_bin2c = BIN2C $@
+ cmd_bin2c = cat $(obj)/purgatory.ro | $(objtree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c
+
+$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
+ $(call if_changed,bin2c)
+
+
+obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o
diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S
new file mode 100644
index 000000000000..d1a4291d3568
--- /dev/null
+++ b/arch/x86/purgatory/entry64.S
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+
+ * Author(s): Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .balign 16
+ .code64
+ .globl entry64, entry64_regs
+
+
+entry64:
+ /* Setup a gdt that should be preserved */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup new stack */
+ leaq stack_init(%rip), %rsp
+ pushq $0x10 /* CS */
+ leaq new_cs_exit(%rip), %rax
+ pushq %rax
+ lretq
+new_cs_exit:
+
+ /* Load the registers */
+ movq rax(%rip), %rax
+ movq rbx(%rip), %rbx
+ movq rcx(%rip), %rcx
+ movq rdx(%rip), %rdx
+ movq rsi(%rip), %rsi
+ movq rdi(%rip), %rdi
+ movq rsp(%rip), %rsp
+ movq rbp(%rip), %rbp
+ movq r8(%rip), %r8
+ movq r9(%rip), %r9
+ movq r10(%rip), %r10
+ movq r11(%rip), %r11
+ movq r12(%rip), %r12
+ movq r13(%rip), %r13
+ movq r14(%rip), %r14
+ movq r15(%rip), %r15
+
+ /* Jump to the new code... */
+ jmpq *rip(%rip)
+
+ .section ".rodata"
+ .balign 4
+entry64_regs:
+rax: .quad 0x0
+rcx: .quad 0x0
+rdx: .quad 0x0
+rbx: .quad 0x0
+rsp: .quad 0x0
+rbp: .quad 0x0
+rsi: .quad 0x0
+rdi: .quad 0x0
+r8: .quad 0x0
+r9: .quad 0x0
+r10: .quad 0x0
+r11: .quad 0x0
+r12: .quad 0x0
+r13: .quad 0x0
+r14: .quad 0x0
+r15: .quad 0x0
+rip: .quad 0x0
+ .size entry64_regs, . - entry64_regs
+
+ /* GDT */
+ .section ".rodata"
+ .balign 16
+gdt:
+ /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+stack: .quad 0, 0
+stack_init:
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
new file mode 100644
index 000000000000..25e068ba3382
--- /dev/null
+++ b/arch/x86/purgatory/purgatory.c
@@ -0,0 +1,72 @@
+/*
+ * purgatory: Runs between two kernels
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include "sha256.h"
+#include "../boot/string.h"
+
+struct sha_region {
+ unsigned long start;
+ unsigned long len;
+};
+
+unsigned long backup_dest = 0;
+unsigned long backup_src = 0;
+unsigned long backup_sz = 0;
+
+u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 };
+
+struct sha_region sha_regions[16] = {};
+
+/*
+ * On x86, second kernel requries first 640K of memory to boot. Copy
+ * first 640K to a backup region in reserved memory range so that second
+ * kernel can use first 640K.
+ */
+static int copy_backup_region(void)
+{
+ if (backup_dest)
+ memcpy((void *)backup_dest, (void *)backup_src, backup_sz);
+
+ return 0;
+}
+
+int verify_sha256_digest(void)
+{
+ struct sha_region *ptr, *end;
+ u8 digest[SHA256_DIGEST_SIZE];
+ struct sha256_state sctx;
+
+ sha256_init(&sctx);
+ end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])];
+ for (ptr = sha_regions; ptr < end; ptr++)
+ sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
+
+ sha256_final(&sctx, digest);
+
+ if (memcmp(digest, sha256_digest, sizeof(digest)))
+ return 1;
+
+ return 0;
+}
+
+void purgatory(void)
+{
+ int ret;
+
+ ret = verify_sha256_digest();
+ if (ret) {
+ /* loop forever */
+ for (;;)
+ ;
+ }
+ copy_backup_region();
+}
diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S
new file mode 100644
index 000000000000..fe3c91ba1bd0
--- /dev/null
+++ b/arch/x86/purgatory/setup-x86_64.S
@@ -0,0 +1,58 @@
+/*
+ * purgatory: setup code
+ *
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .globl purgatory_start
+ .balign 16
+purgatory_start:
+ .code64
+
+ /* Load a gdt so I know what the segment registers are */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup a stack */
+ leaq lstack_end(%rip), %rsp
+
+ /* Call the C code */
+ call purgatory
+ jmp entry64
+
+ .section ".rodata"
+ .balign 16
+gdt: /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as the gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+
+ .bss
+ .balign 4096
+lstack:
+ .skip 4096
+lstack_end:
diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c
new file mode 100644
index 000000000000..548ca675a14a
--- /dev/null
+++ b/arch/x86/purgatory/sha256.c
@@ -0,0 +1,283 @@
+/*
+ * SHA-256, as specified in
+ * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
+ *
+ * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+#include "sha256.h"
+#include "../boot/string.h"
+
+static inline u32 Ch(u32 x, u32 y, u32 z)
+{
+ return z ^ (x & (y ^ z));
+}
+
+static inline u32 Maj(u32 x, u32 y, u32 z)
+{
+ return (x & y) | (z & (x | y));
+}
+
+#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22))
+#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25))
+#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3))
+#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10))
+
+static inline void LOAD_OP(int I, u32 *W, const u8 *input)
+{
+ W[I] = __be32_to_cpu(((__be32 *)(input))[I]);
+}
+
+static inline void BLEND_OP(int I, u32 *W)
+{
+ W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+}
+
+static void sha256_transform(u32 *state, const u8 *input)
+{
+ u32 a, b, c, d, e, f, g, h, t1, t2;
+ u32 W[64];
+ int i;
+
+ /* load the input */
+ for (i = 0; i < 16; i++)
+ LOAD_OP(i, W, input);
+
+ /* now blend */
+ for (i = 16; i < 64; i++)
+ BLEND_OP(i, W);
+
+ /* load the state into our registers */
+ a = state[0]; b = state[1]; c = state[2]; d = state[3];
+ e = state[4]; f = state[5]; g = state[6]; h = state[7];
+
+ /* now iterate */
+ t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ state[0] += a; state[1] += b; state[2] += c; state[3] += d;
+ state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+
+ /* clear any sensitive info... */
+ a = b = c = d = e = f = g = h = t1 = t2 = 0;
+ memset(W, 0, 64 * sizeof(u32));
+}
+
+int sha256_init(struct sha256_state *sctx)
+{
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
+{
+ unsigned int partial, done;
+ const u8 *src;
+
+ partial = sctx->count & 0x3f;
+ sctx->count += len;
+ done = 0;
+ src = data;
+
+ if ((partial + len) > 63) {
+ if (partial) {
+ done = -partial;
+ memcpy(sctx->buf + partial, data, done + 64);
+ src = sctx->buf;
+ }
+
+ do {
+ sha256_transform(sctx->state, src);
+ done += 64;
+ src = data + done;
+ } while (done + 63 < len);
+
+ partial = 0;
+ }
+ memcpy(sctx->buf + partial, src, len - done);
+
+ return 0;
+}
+
+int sha256_final(struct sha256_state *sctx, u8 *out)
+{
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ unsigned int index, pad_len;
+ int i;
+ static const u8 padding[64] = { 0x80, };
+
+ /* Save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64. */
+ index = sctx->count & 0x3f;
+ pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
+ sha256_update(sctx, padding, pad_len);
+
+ /* Append length (before padding) */
+ sha256_update(sctx, (const u8 *)&bits, sizeof(bits));
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Zeroize sensitive information. */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h
new file mode 100644
index 000000000000..bd15a4127735
--- /dev/null
+++ b/arch/x86/purgatory/sha256.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author: Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#ifndef SHA256_H
+#define SHA256_H
+
+
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+extern int sha256_init(struct sha256_state *sctx);
+extern int sha256_update(struct sha256_state *sctx, const u8 *input,
+ unsigned int length);
+extern int sha256_final(struct sha256_state *sctx, u8 *hash);
+
+#endif /* SHA256_H */
diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S
new file mode 100644
index 000000000000..3cefba1fefc8
--- /dev/null
+++ b/arch/x86/purgatory/stack.S
@@ -0,0 +1,19 @@
+/*
+ * purgatory: stack
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ /* A stack for the loaded kernel.
+ * Seperate and in the data section so it can be prepopulated.
+ */
+ .data
+ .balign 4096
+ .globl stack, stack_end
+
+stack:
+ .skip 4096
+stack_end:
diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c
new file mode 100644
index 000000000000..d886b1fa36f0
--- /dev/null
+++ b/arch/x86/purgatory/string.c
@@ -0,0 +1,13 @@
+/*
+ * Simple string functions.
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include "../boot/string.c"
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b867921612..028b78168d85 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,6 @@
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
353 i386 renameat2 sys_renameat2
+354 i386 seccomp sys_seccomp
+355 i386 getrandom sys_getrandom
+356 i386 memfd_create sys_memfd_create
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1646d2..35dd922727b9 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,10 @@
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
+317 common seccomp sys_seccomp
+318 common getrandom sys_getrandom
+319 common memfd_create sys_memfd_create
+320 common kexec_file_load sys_kexec_file_load
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 0feee2fd5077..25a1022dd793 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -216,6 +216,5 @@ extern long elf_aux_hwcap;
#define ELF_HWCAP (elf_aux_hwcap)
#define SET_PERSONALITY(ex) do ; while(0)
-#define __HAVE_ARCH_GATE_AREA 1
#endif
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 04f82e020f2b..2a206d2b14ab 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -25,7 +25,8 @@ static inline void rep_nop(void)
__asm__ __volatile__("rep;nop": : :"memory");
}
-#define cpu_relax() rep_nop()
+#define cpu_relax() rep_nop()
+#define cpu_relax_lowlatency() cpu_relax()
#include <asm/processor-generic.h>
diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c
index c6492e75797b..f8fecaddcc0d 100644
--- a/arch/x86/um/mem_64.c
+++ b/arch/x86/um/mem_64.c
@@ -9,18 +9,3 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return NULL;
}
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 5e04a1c899fa..79d824551c1a 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -370,13 +370,12 @@ struct rt_sigframe
char retcode[8];
};
-int setup_signal_stack_sc(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs *regs,
- sigset_t *mask)
+int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
{
struct sigframe __user *frame;
void __user *restorer;
- int err = 0;
+ int err = 0, sig = ksig->sig;
/* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */
stack_top = ((stack_top + 4) & -16UL) - 4;
@@ -385,8 +384,8 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
return 1;
restorer = frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
err |= __put_user(sig, &frame->sig);
@@ -410,20 +409,19 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
return err;
PT_REGS_SP(regs) = (unsigned long) frame;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
PT_REGS_AX(regs) = (unsigned long) sig;
PT_REGS_DX(regs) = (unsigned long) 0;
PT_REGS_CX(regs) = (unsigned long) 0;
return 0;
}
-int setup_signal_stack_si(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs *regs,
- siginfo_t *info, sigset_t *mask)
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
{
struct rt_sigframe __user *frame;
void __user *restorer;
- int err = 0;
+ int err = 0, sig = ksig->sig;
stack_top &= -8UL;
frame = (struct rt_sigframe __user *) stack_top - 1;
@@ -431,14 +429,14 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return 1;
restorer = frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
err |= __put_user(sig, &frame->sig);
err |= __put_user(&frame->info, &frame->pinfo);
err |= __put_user(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask,
PT_REGS_SP(regs));
@@ -457,7 +455,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return err;
PT_REGS_SP(regs) = (unsigned long) frame;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
PT_REGS_AX(regs) = (unsigned long) sig;
PT_REGS_DX(regs) = (unsigned long) &frame->info;
PT_REGS_CX(regs) = (unsigned long) &frame->uc;
@@ -502,12 +500,11 @@ struct rt_sigframe
struct _fpstate fpstate;
};
-int setup_signal_stack_si(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs * regs,
- siginfo_t *info, sigset_t *set)
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *set)
{
struct rt_sigframe __user *frame;
- int err = 0;
+ int err = 0, sig = ksig->sig;
frame = (struct rt_sigframe __user *)
round_down(stack_top - sizeof(struct rt_sigframe), 16);
@@ -517,8 +514,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto out;
- if (ka->sa.sa_flags & SA_SIGINFO) {
- err |= copy_siginfo_to_user(&frame->info, info);
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
if (err)
goto out;
}
@@ -543,8 +540,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
* already in userspace.
*/
/* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER)
- err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode);
else
/* could use a vstub here */
return err;
@@ -570,7 +567,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
*/
PT_REGS_SI(regs) = (unsigned long) &frame->info;
PT_REGS_DX(regs) = (unsigned long) &frame->uc;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
out:
return err;
}
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 61b04fe36e66..5a4affe025e8 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -10,7 +10,7 @@ VDSO32-$(CONFIG_X86_32) := y
VDSO32-$(CONFIG_COMPAT) := y
# files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vdso-fakesections.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
# files to link into kernel
obj-y += vma.o
@@ -37,7 +37,8 @@ vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
obj-y += $(vdso_img_objs)
targets += $(vdso_img_cfiles)
targets += $(vdso_img_sodbg)
-.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c)
+.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \
+ $(vdso_img-y:%=$(obj)/vdso%.so)
export CPPFLAGS_vdso.lds += -P -C
@@ -54,10 +55,10 @@ hostprogs-y += vdso2c
quiet_cmd_vdso2c = VDSO2C $@
define cmd_vdso2c
- $(obj)/vdso2c $< $@
+ $(obj)/vdso2c $< $(<:%.dbg=%) $@
endef
-$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE
+$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
$(call if_changed,vdso2c)
#
@@ -113,6 +114,10 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE
targets += vdsox32.lds $(vobjx32s-y)
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg
+ $(call if_changed,objcopy)
+
$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
@@ -134,7 +139,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
-targets += vdso32/vclock_gettime.o vdso32/vdso-fakesections.o
+targets += vdso32/vclock_gettime.o
$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
@@ -156,7 +161,6 @@ $(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
$(obj)/vdso32/vdso32.lds \
$(obj)/vdso32/vclock_gettime.o \
- $(obj)/vdso32/vdso-fakesections.o \
$(obj)/vdso32/note.o \
$(obj)/vdso32/%.o
$(call if_changed,vdso)
diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c
deleted file mode 100644
index aa5fbfab20a5..000000000000
--- a/arch/x86/vdso/vdso-fakesections.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright 2014 Andy Lutomirski
- * Subject to the GNU Public License, v.2
- *
- * String table for loadable section headers. See vdso2c.h for why
- * this exists.
- */
-
-const char fake_shstrtab[] __attribute__((section(".fake_shstrtab"))) =
- ".hash\0"
- ".dynsym\0"
- ".dynstr\0"
- ".gnu.version\0"
- ".gnu.version_d\0"
- ".dynamic\0"
- ".rodata\0"
- ".fake_shstrtab\0" /* Yay, self-referential code. */
- ".note\0"
- ".eh_frame_hdr\0"
- ".eh_frame\0"
- ".text";
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
index 9197544eea9a..de2c921025f5 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -18,6 +18,25 @@
SECTIONS
{
+ /*
+ * User/kernel shared data is before the vDSO. This may be a little
+ * uglier than putting it after the vDSO, but it avoids issues with
+ * non-allocatable things that dangle past the end of the PT_LOAD
+ * segment.
+ */
+
+ vvar_start = . - 2 * PAGE_SIZE;
+ vvar_page = vvar_start;
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+#undef EMIT_VVAR
+
+ hpet_page = vvar_start + PAGE_SIZE;
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
@@ -74,31 +93,6 @@ SECTIONS
.altinstructions : { *(.altinstructions) } :text
.altinstr_replacement : { *(.altinstr_replacement) } :text
- /*
- * The remainder of the vDSO consists of special pages that are
- * shared between the kernel and userspace. It needs to be at the
- * end so that it doesn't overlap the mapping of the actual
- * vDSO image.
- */
-
- . = ALIGN(PAGE_SIZE);
- vvar_page = .;
-
- /* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
-#define __VVAR_KERNEL_LDS
-#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
-#undef EMIT_VVAR
-
- . = vvar_page + PAGE_SIZE;
-
- hpet_page = .;
- . = . + PAGE_SIZE;
-
- . = ALIGN(PAGE_SIZE);
- end_mapping = .;
-
/DISCARD/ : {
*(.discard)
*(.discard.*)
diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c
index 238dbe82776e..8627db24a7f6 100644
--- a/arch/x86/vdso/vdso2c.c
+++ b/arch/x86/vdso/vdso2c.c
@@ -1,3 +1,53 @@
+/*
+ * vdso2c - A vdso image preparation tool
+ * Copyright (c) 2014 Andy Lutomirski and others
+ * Licensed under the GPL v2
+ *
+ * vdso2c requires stripped and unstripped input. It would be trivial
+ * to fully strip the input in here, but, for reasons described below,
+ * we need to write a section table. Doing this is more or less
+ * equivalent to dropping all non-allocatable sections, but it's
+ * easier to let objcopy handle that instead of doing it ourselves.
+ * If we ever need to do something fancier than what objcopy provides,
+ * it would be straightforward to add here.
+ *
+ * We're keep a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * were no SHT_SYNDYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+
#include <inttypes.h>
#include <stdint.h>
#include <unistd.h>
@@ -20,9 +70,9 @@ const char *outfilename;
/* Symbols that we need in vdso2c. */
enum {
+ sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
- sym_end_mapping,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
@@ -38,9 +88,9 @@ struct vdso_sym {
};
struct vdso_sym required_syms[] = {
+ [sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
- [sym_end_mapping] = {"end_mapping", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
@@ -61,7 +111,8 @@ static void fail(const char *format, ...)
va_start(ap, format);
fprintf(stderr, "Error: ");
vfprintf(stderr, format, ap);
- unlink(outfilename);
+ if (outfilename)
+ unlink(outfilename);
exit(1);
va_end(ap);
}
@@ -96,9 +147,11 @@ extern void bad_put_le(void);
#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0]))
-#define BITSFUNC3(name, bits) name##bits
-#define BITSFUNC2(name, bits) BITSFUNC3(name, bits)
-#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS)
+#define BITSFUNC3(name, bits, suffix) name##bits##suffix
+#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
+
+#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
@@ -112,30 +165,53 @@ extern void bad_put_le(void);
#include "vdso2c.h"
#undef ELF_BITS
-static void go(void *addr, size_t len, FILE *outfile, const char *name)
+static void go(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
{
- Elf64_Ehdr *hdr = (Elf64_Ehdr *)addr;
+ Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
- go64(addr, len, outfile, name);
+ go64(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
} else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
- go32(addr, len, outfile, name);
+ go32(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
} else {
fail("unknown ELF class\n");
}
}
+static void map_input(const char *name, void **addr, size_t *len, int prot)
+{
+ off_t tmp_len;
+
+ int fd = open(name, O_RDONLY);
+ if (fd == -1)
+ err(1, "%s", name);
+
+ tmp_len = lseek(fd, 0, SEEK_END);
+ if (tmp_len == (off_t)-1)
+ err(1, "lseek");
+ *len = (size_t)tmp_len;
+
+ *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
+ if (*addr == MAP_FAILED)
+ err(1, "mmap");
+
+ close(fd);
+}
+
int main(int argc, char **argv)
{
- int fd;
- off_t len;
- void *addr;
+ size_t raw_len, stripped_len;
+ void *raw_addr, *stripped_addr;
FILE *outfile;
char *name, *tmp;
int namelen;
- if (argc != 3) {
- printf("Usage: vdso2c INPUT OUTPUT\n");
+ if (argc != 4) {
+ printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
return 1;
}
@@ -143,7 +219,7 @@ int main(int argc, char **argv)
* Figure out the struct name. If we're writing to a .so file,
* generate raw output insted.
*/
- name = strdup(argv[2]);
+ name = strdup(argv[3]);
namelen = strlen(name);
if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
name = NULL;
@@ -159,26 +235,18 @@ int main(int argc, char **argv)
*tmp = '_';
}
- fd = open(argv[1], O_RDONLY);
- if (fd == -1)
- err(1, "%s", argv[1]);
-
- len = lseek(fd, 0, SEEK_END);
- if (len == (off_t)-1)
- err(1, "lseek");
-
- addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
- if (addr == MAP_FAILED)
- err(1, "mmap");
+ map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
+ map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
- outfilename = argv[2];
+ outfilename = argv[3];
outfile = fopen(outfilename, "w");
if (!outfile)
err(1, "%s", argv[2]);
- go(addr, (size_t)len, outfile, name);
+ go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
- munmap(addr, len);
+ munmap(raw_addr, raw_len);
+ munmap(stripped_addr, stripped_len);
fclose(outfile);
return 0;
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h
index 11b65d4f9414..fd57829b30d8 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/vdso/vdso2c.h
@@ -4,139 +4,23 @@
* are built for 32-bit userspace.
*/
-/*
- * We're writing a section table for a few reasons:
- *
- * The Go runtime had a couple of bugs: it would read the section
- * table to try to figure out how many dynamic symbols there were (it
- * shouldn't have looked at the section table at all) and, if there
- * were no SHT_SYNDYM section table entry, it would use an
- * uninitialized value for the number of symbols. An empty DYNSYM
- * table would work, but I see no reason not to write a valid one (and
- * keep full performance for old Go programs). This hack is only
- * needed on x86_64.
- *
- * The bug was introduced on 2012-08-31 by:
- * https://code.google.com/p/go/source/detail?r=56ea40aac72b
- * and was fixed on 2014-06-13 by:
- * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
- *
- * Binutils has issues debugging the vDSO: it reads the section table to
- * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
- * would break build-id if we removed the section table. Binutils
- * also requires that shstrndx != 0. See:
- * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
- *
- * elfutils might not look for PT_NOTE if there is a section table at
- * all. I don't know whether this matters for any practical purpose.
- *
- * For simplicity, rather than hacking up a partial section table, we
- * just write a mostly complete one. We omit non-dynamic symbols,
- * though, since they're rather large.
- *
- * Once binutils gets fixed, we might be able to drop this for all but
- * the 64-bit vdso, since build-id only works in kernel RPMs, and
- * systems that update to new enough kernel RPMs will likely update
- * binutils in sync. build-id has never worked for home-built kernel
- * RPMs without manual symlinking, and I suspect that no one ever does
- * that.
- */
-struct BITSFUNC(fake_sections)
-{
- ELF(Shdr) *table;
- unsigned long table_offset;
- int count, max_count;
-
- int in_shstrndx;
- unsigned long shstr_offset;
- const char *shstrtab;
- size_t shstrtab_len;
-
- int out_shstrndx;
-};
-
-static unsigned int BITSFUNC(find_shname)(struct BITSFUNC(fake_sections) *out,
- const char *name)
-{
- const char *outname = out->shstrtab;
- while (outname - out->shstrtab < out->shstrtab_len) {
- if (!strcmp(name, outname))
- return (outname - out->shstrtab) + out->shstr_offset;
- outname += strlen(outname) + 1;
- }
-
- if (*name)
- printf("Warning: could not find output name \"%s\"\n", name);
- return out->shstr_offset + out->shstrtab_len - 1; /* Use a null. */
-}
-
-static void BITSFUNC(init_sections)(struct BITSFUNC(fake_sections) *out)
-{
- if (!out->in_shstrndx)
- fail("didn't find the fake shstrndx\n");
-
- memset(out->table, 0, out->max_count * sizeof(ELF(Shdr)));
-
- if (out->max_count < 1)
- fail("we need at least two fake output sections\n");
-
- PUT_LE(&out->table[0].sh_type, SHT_NULL);
- PUT_LE(&out->table[0].sh_name, BITSFUNC(find_shname)(out, ""));
-
- out->count = 1;
-}
-
-static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out,
- int in_idx, const ELF(Shdr) *in,
- const char *name)
-{
- uint64_t flags = GET_LE(&in->sh_flags);
-
- bool copy = flags & SHF_ALLOC &&
- (GET_LE(&in->sh_size) ||
- (GET_LE(&in->sh_type) != SHT_RELA &&
- GET_LE(&in->sh_type) != SHT_REL)) &&
- strcmp(name, ".altinstructions") &&
- strcmp(name, ".altinstr_replacement");
-
- if (!copy)
- return;
-
- if (out->count >= out->max_count)
- fail("too many copied sections (max = %d)\n", out->max_count);
-
- if (in_idx == out->in_shstrndx)
- out->out_shstrndx = out->count;
-
- out->table[out->count] = *in;
- PUT_LE(&out->table[out->count].sh_name,
- BITSFUNC(find_shname)(out, name));
-
- /* elfutils requires that a strtab have the correct type. */
- if (!strcmp(name, ".fake_shstrtab"))
- PUT_LE(&out->table[out->count].sh_type, SHT_STRTAB);
-
- out->count++;
-}
-
-static void BITSFUNC(go)(void *addr, size_t len,
+static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
FILE *outfile, const char *name)
{
int found_load = 0;
unsigned long load_size = -1; /* Work around bogus warning */
- unsigned long data_size;
- ELF(Ehdr) *hdr = (ELF(Ehdr) *)addr;
+ unsigned long mapping_size;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
int i;
unsigned long j;
ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
*alt_sec = NULL;
ELF(Dyn) *dyn = 0, *dyn_end = 0;
const char *secstrings;
- uint64_t syms[NSYMS] = {};
-
- struct BITSFUNC(fake_sections) fake_sections = {};
+ INT_BITS syms[NSYMS] = {};
- ELF(Phdr) *pt = (ELF(Phdr) *)(addr + GET_LE(&hdr->e_phoff));
+ ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
/* Walk the segment table. */
for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
@@ -154,14 +38,16 @@ static void BITSFUNC(go)(void *addr, size_t len,
load_size = GET_LE(&pt[i].p_memsz);
found_load = 1;
} else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
- dyn = addr + GET_LE(&pt[i].p_offset);
- dyn_end = addr + GET_LE(&pt[i].p_offset) +
+ dyn = raw_addr + GET_LE(&pt[i].p_offset);
+ dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
GET_LE(&pt[i].p_memsz);
}
}
if (!found_load)
fail("no PT_LOAD seg\n");
- data_size = (load_size + 4095) / 4096 * 4096;
+
+ if (stripped_len < load_size)
+ fail("stripped input is too short\n");
/* Walk the dynamic table */
for (i = 0; dyn + i < dyn_end &&
@@ -173,11 +59,11 @@ static void BITSFUNC(go)(void *addr, size_t len,
}
/* Walk the section table */
- secstrings_hdr = addr + GET_LE(&hdr->e_shoff) +
+ secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
- secstrings = addr + GET_LE(&secstrings_hdr->sh_offset);
+ secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
- ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) +
+ ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * i;
if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
symtab_hdr = sh;
@@ -190,7 +76,7 @@ static void BITSFUNC(go)(void *addr, size_t len,
if (!symtab_hdr)
fail("no symbol table\n");
- strtab_hdr = addr + GET_LE(&hdr->e_shoff) +
+ strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
/* Walk the symbol table */
@@ -198,9 +84,9 @@ static void BITSFUNC(go)(void *addr, size_t len,
i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
i++) {
int k;
- ELF(Sym) *sym = addr + GET_LE(&symtab_hdr->sh_offset) +
+ ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
- const char *name = addr + GET_LE(&strtab_hdr->sh_offset) +
+ const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) +
GET_LE(&sym->st_name);
for (k = 0; k < NSYMS; k++) {
@@ -209,51 +95,17 @@ static void BITSFUNC(go)(void *addr, size_t len,
fail("duplicate symbol %s\n",
required_syms[k].name);
}
+
+ /*
+ * Careful: we use negative addresses, but
+ * st_value is unsigned, so we rely
+ * on syms[k] being a signed type of the
+ * correct width.
+ */
syms[k] = GET_LE(&sym->st_value);
}
}
-
- if (!strcmp(name, "fake_shstrtab")) {
- ELF(Shdr) *sh;
-
- fake_sections.in_shstrndx = GET_LE(&sym->st_shndx);
- fake_sections.shstrtab = addr + GET_LE(&sym->st_value);
- fake_sections.shstrtab_len = GET_LE(&sym->st_size);
- sh = addr + GET_LE(&hdr->e_shoff) +
- GET_LE(&hdr->e_shentsize) *
- fake_sections.in_shstrndx;
- fake_sections.shstr_offset = GET_LE(&sym->st_value) -
- GET_LE(&sh->sh_addr);
- }
- }
-
- /* Build the output section table. */
- if (!syms[sym_VDSO_FAKE_SECTION_TABLE_START] ||
- !syms[sym_VDSO_FAKE_SECTION_TABLE_END])
- fail("couldn't find fake section table\n");
- if ((syms[sym_VDSO_FAKE_SECTION_TABLE_END] -
- syms[sym_VDSO_FAKE_SECTION_TABLE_START]) % sizeof(ELF(Shdr)))
- fail("fake section table size isn't a multiple of sizeof(Shdr)\n");
- fake_sections.table = addr + syms[sym_VDSO_FAKE_SECTION_TABLE_START];
- fake_sections.table_offset = syms[sym_VDSO_FAKE_SECTION_TABLE_START];
- fake_sections.max_count = (syms[sym_VDSO_FAKE_SECTION_TABLE_END] -
- syms[sym_VDSO_FAKE_SECTION_TABLE_START]) /
- sizeof(ELF(Shdr));
-
- BITSFUNC(init_sections)(&fake_sections);
- for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
- ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) +
- GET_LE(&hdr->e_shentsize) * i;
- BITSFUNC(copy_section)(&fake_sections, i, sh,
- secstrings + GET_LE(&sh->sh_name));
}
- if (!fake_sections.out_shstrndx)
- fail("didn't generate shstrndx?!?\n");
-
- PUT_LE(&hdr->e_shoff, fake_sections.table_offset);
- PUT_LE(&hdr->e_shentsize, sizeof(ELF(Shdr)));
- PUT_LE(&hdr->e_shnum, fake_sections.count);
- PUT_LE(&hdr->e_shstrndx, fake_sections.out_shstrndx);
/* Validate mapping addresses. */
for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
@@ -263,21 +115,23 @@ static void BITSFUNC(go)(void *addr, size_t len,
if (syms[i] % 4096)
fail("%s must be a multiple of 4096\n",
required_syms[i].name);
- if (syms[i] < data_size)
- fail("%s must be after the text mapping\n",
+ if (syms[sym_vvar_start] > syms[i] + 4096)
+ fail("%s underruns begin_vvar\n",
required_syms[i].name);
- if (syms[sym_end_mapping] < syms[i] + 4096)
- fail("%s overruns end_mapping\n",
+ if (syms[i] + 4096 > 0)
+ fail("%s is on the wrong side of the vdso text\n",
required_syms[i].name);
}
- if (syms[sym_end_mapping] % 4096)
- fail("end_mapping must be a multiple of 4096\n");
+ if (syms[sym_vvar_start] % 4096)
+ fail("vvar_begin must be a multiple of 4096\n");
if (!name) {
- fwrite(addr, load_size, 1, outfile);
+ fwrite(stripped_addr, stripped_len, 1, outfile);
return;
}
+ mapping_size = (stripped_len + 4095) / 4096 * 4096;
+
fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
fprintf(outfile, "#include <linux/linkage.h>\n");
fprintf(outfile, "#include <asm/page_types.h>\n");
@@ -285,20 +139,21 @@ static void BITSFUNC(go)(void *addr, size_t len,
fprintf(outfile, "\n");
fprintf(outfile,
"static unsigned char raw_data[%lu] __page_aligned_data = {",
- data_size);
- for (j = 0; j < load_size; j++) {
+ mapping_size);
+ for (j = 0; j < stripped_len; j++) {
if (j % 10 == 0)
fprintf(outfile, "\n\t");
- fprintf(outfile, "0x%02X, ", (int)((unsigned char *)addr)[j]);
+ fprintf(outfile, "0x%02X, ",
+ (int)((unsigned char *)stripped_addr)[j]);
}
fprintf(outfile, "\n};\n\n");
fprintf(outfile, "static struct page *pages[%lu];\n\n",
- data_size / 4096);
+ mapping_size / 4096);
fprintf(outfile, "const struct vdso_image %s = {\n", name);
fprintf(outfile, "\t.data = raw_data,\n");
- fprintf(outfile, "\t.size = %lu,\n", data_size);
+ fprintf(outfile, "\t.size = %lu,\n", mapping_size);
fprintf(outfile, "\t.text_mapping = {\n");
fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
fprintf(outfile, "\t\t.pages = pages,\n");
@@ -311,8 +166,8 @@ static void BITSFUNC(go)(void *addr, size_t len,
}
for (i = 0; i < NSYMS; i++) {
if (required_syms[i].export && syms[i])
- fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n",
- required_syms[i].name, syms[i]);
+ fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
+ required_syms[i].name, (int64_t)syms[i]);
}
fprintf(outfile, "};\n");
}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index e4f7781ee162..e904c270573b 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -115,23 +115,6 @@ static __init int ia32_binfmt_init(void)
return 0;
}
__initcall(ia32_binfmt_init);
-#endif
-
-#else /* CONFIG_X86_32 */
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
+#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 5a5176de8d0a..970463b566cf 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -93,7 +93,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long addr;
+ unsigned long addr, text_start;
int ret = 0;
static struct page *no_pages[] = {NULL};
static struct vm_special_mapping vvar_mapping = {
@@ -103,26 +103,28 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
- image->sym_end_mapping);
+ image->size - image->sym_vvar_start);
} else {
addr = 0;
}
down_write(&mm->mmap_sem);
- addr = get_unmapped_area(NULL, addr, image->sym_end_mapping, 0, 0);
+ addr = get_unmapped_area(NULL, addr,
+ image->size - image->sym_vvar_start, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
- current->mm->context.vdso = (void __user *)addr;
+ text_start = addr - image->sym_vvar_start;
+ current->mm->context.vdso = (void __user *)text_start;
/*
* MAYWRITE to allow gdb to COW and set breakpoints
*/
vma = _install_special_mapping(mm,
- addr,
+ text_start,
image->size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
@@ -134,9 +136,9 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
}
vma = _install_special_mapping(mm,
- addr + image->size,
- image->sym_end_mapping - image->size,
- VM_READ,
+ addr,
+ -image->sym_vvar_start,
+ VM_READ|VM_MAYREAD,
&vvar_mapping);
if (IS_ERR(vma)) {
@@ -146,7 +148,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
if (image->sym_vvar_page)
ret = remap_pfn_range(vma,
- addr + image->sym_vvar_page,
+ text_start + image->sym_vvar_page,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT,
PAGE_SIZE,
PAGE_READONLY);
@@ -157,7 +159,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
#ifdef CONFIG_HPET_TIMER
if (hpet_address && image->sym_hpet_page) {
ret = io_remap_pfn_range(vma,
- addr + image->sym_hpet_page,
+ text_start + image->sym_hpet_page,
hpet_address >> PAGE_SHIFT,
PAGE_SIZE,
pgprot_noncached(PAGE_READONLY));
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c09cb68..7322755f337a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
+obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
new file mode 100644
index 000000000000..a02e09e18f57
--- /dev/null
+++ b/arch/x86/xen/efi.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Oracle Co., Daniel Kiper
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/string.h>
+
+#include <xen/xen-ops.h>
+
+#include <asm/setup.h>
+
+void __init xen_efi_init(void)
+{
+ efi_system_table_t *efi_systab_xen;
+
+ efi_systab_xen = xen_efi_probe();
+
+ if (efi_systab_xen == NULL)
+ return;
+
+ strncpy((char *)&boot_params.efi_info.efi_loader_signature, "Xen",
+ sizeof(boot_params.efi_info.efi_loader_signature));
+ boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
+ boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
+
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_PARAVIRT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ffb101e45731..c0cb11fb5008 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1718,6 +1718,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_setup_runstate_info(0);
+ xen_efi_init();
+
/* Start the world */
#ifdef CONFIG_X86_32
i386_start_kernel();
@@ -1826,8 +1828,19 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_mmu_ops();
}
+static bool xen_nopv = false;
+static __init int xen_parse_nopv(char *arg)
+{
+ xen_nopv = true;
+ return 0;
+}
+early_param("xen_nopv", xen_parse_nopv);
+
static uint32_t __init xen_hvm_platform(void)
{
+ if (xen_nopv)
+ return 0;
+
if (xen_pv_domain())
return 0;
@@ -1836,6 +1849,8 @@ static uint32_t __init xen_hvm_platform(void)
bool xen_hvm_need_lapic(void)
{
+ if (xen_nopv)
+ return false;
if (xen_pv_domain())
return false;
if (!xen_hvm_domain())
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index ebfa9b2c871d..1580e7a5a4cf 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -49,7 +49,7 @@
static struct gnttab_vm_area {
struct vm_struct *area;
pte_t **ptes;
-} gnttab_shared_vm_area, gnttab_status_vm_area;
+} gnttab_shared_vm_area;
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
@@ -73,43 +73,16 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
return 0;
}
-int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- grant_status_t **__shared)
-{
- grant_status_t *shared = *__shared;
- unsigned long addr;
- unsigned long i;
-
- if (shared == NULL)
- *__shared = shared = gnttab_status_vm_area.area->addr;
-
- addr = (unsigned long)shared;
-
- for (i = 0; i < nr_gframes; i++) {
- set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
- mfn_pte(frames[i], PAGE_KERNEL));
- addr += PAGE_SIZE;
- }
-
- return 0;
-}
-
void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
- pte_t **ptes;
unsigned long addr;
unsigned long i;
- if (shared == gnttab_status_vm_area.area->addr)
- ptes = gnttab_status_vm_area.ptes;
- else
- ptes = gnttab_shared_vm_area.ptes;
-
addr = (unsigned long)shared;
for (i = 0; i < nr_gframes; i++) {
- set_pte_at(&init_mm, addr, ptes[i], __pte(0));
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ __pte(0));
addr += PAGE_SIZE;
}
}
@@ -129,35 +102,12 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
return 0;
}
-static void arch_gnttab_vfree(struct gnttab_vm_area *area)
+int arch_gnttab_init(unsigned long nr_shared)
{
- free_vm_area(area->area);
- kfree(area->ptes);
-}
-
-int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
-{
- int ret;
-
if (!xen_pv_domain())
return 0;
- ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
- if (ret < 0)
- return ret;
-
- /*
- * Always allocate the space for the status frames in case
- * we're migrated to a host with V2 support.
- */
- ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
- if (ret < 0)
- goto err;
-
- return 0;
- err:
- arch_gnttab_vfree(&gnttab_shared_vm_area);
- return -ENOMEM;
+ return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
}
#ifdef CONFIG_XEN_PVH
@@ -168,6 +118,7 @@ static int __init xlated_setup_gnttab_pages(void)
{
struct page **pages;
xen_pfn_t *pfns;
+ void *vaddr;
int rc;
unsigned int i;
unsigned long nr_grant_frames = gnttab_max_grant_frames();
@@ -193,21 +144,20 @@ static int __init xlated_setup_gnttab_pages(void)
for (i = 0; i < nr_grant_frames; i++)
pfns[i] = page_to_pfn(pages[i]);
- rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
- &xen_auto_xlat_grant_frames.vaddr);
-
- if (rc) {
+ vaddr = vmap(pages, nr_grant_frames, 0, PAGE_KERNEL);
+ if (!vaddr) {
pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
nr_grant_frames, rc);
free_xenballooned_pages(nr_grant_frames, pages);
kfree(pages);
kfree(pfns);
- return rc;
+ return -ENOMEM;
}
kfree(pages);
xen_auto_xlat_grant_frames.pfn = pfns;
xen_auto_xlat_grant_frames.count = nr_grant_frames;
+ xen_auto_xlat_grant_frames.vaddr = vaddr;
return 0;
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index e8a1201c3293..16fb0099b7f2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1866,12 +1866,11 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
*
* We can construct this by grafting the Xen provided pagetable into
* head_64.S's preconstructed pagetables. We copy the Xen L2's into
- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
- * means that only the kernel has a physical mapping to start with -
- * but that's enough to get __va working. We need to fill in the rest
- * of the physical mapping once some sort of allocator has been set
- * up.
- * NOTE: for PVH, the page tables are native.
+ * level2_ident_pgt, and level2_kernel_pgt. This means that only the
+ * kernel has a physical mapping to start with - but that's enough to
+ * get __va working. We need to fill in the rest of the physical
+ * mapping once some sort of allocator has been set up. NOTE: for
+ * PVH, the page tables are native.
*/
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
@@ -1902,8 +1901,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* L3_i[0] -> level2_ident_pgt */
convert_pfn_mfn(level3_ident_pgt);
/* L3_k[510] -> level2_kernel_pgt
- * L3_i[511] -> level2_fixmap_pgt */
+ * L3_k[511] -> level2_fixmap_pgt */
convert_pfn_mfn(level3_kernel_pgt);
+
+ /* L3_k[511][506] -> level1_fixmap_pgt */
+ convert_pfn_mfn(level2_fixmap_pgt);
}
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
@@ -1913,21 +1915,15 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
addr[1] = (unsigned long)l3;
addr[2] = (unsigned long)l2;
/* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
- * Both L4[272][0] and L4[511][511] have entries that point to the same
+ * Both L4[272][0] and L4[511][510] have entries that point to the same
* L2 (PMD) tables. Meaning that if you modify it in __va space
* it will be also modified in the __ka space! (But if you just
* modify the PMD table to point to other PTE's or none, then you
* are OK - which is what cleanup_highmap does) */
copy_page(level2_ident_pgt, l2);
- /* Graft it onto L4[511][511] */
+ /* Graft it onto L4[511][510] */
copy_page(level2_kernel_pgt, l2);
- /* Get [511][510] and graft that in level2_fixmap_pgt */
- l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
- l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
- copy_page(level2_fixmap_pgt, l2);
- /* Note that we don't do anything with level1_fixmap_pgt which
- * we don't need. */
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1937,6 +1933,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 9bb3d82ffec8..3172692381ae 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -841,10 +841,9 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
pfn = ALIGN(pfn, P2M_PER_PAGE);
}
- if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+ WARN((pfn - pfn_s) != (pfn_e - pfn_s),
"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
- (pfn_e - pfn_s) - (pfn - pfn_s)))
- printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+ (pfn_e - pfn_s) - (pfn - pfn_s));
return pfn - pfn_s;
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 7b78f88c1707..5718b0b58b60 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -444,7 +444,7 @@ void xen_setup_timer(int cpu)
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
- IRQF_FORCE_RESUME,
+ IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
name, NULL);
(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 97d87659f779..28c7e0be56e4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -105,6 +105,14 @@ static inline void __init xen_init_apic(void)
}
#endif
+#ifdef CONFIG_XEN_EFI
+extern void xen_efi_init(void);
+#else
+static inline void __init xen_efi_init(void)
+{
+}
+#endif
+
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \