diff options
Diffstat (limited to 'arch/x86/kernel')
101 files changed, 2790 insertions, 1683 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 532d2e090e6f..56ebd1f98447 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds +extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 406ed77216d0..7c439fe4941b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -239,7 +239,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) + if (!apic->apic_id_valid(apic_id) && enabled) printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); else acpi_register_lapic(apic_id, enabled); @@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -642,6 +642,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) kfree(buffer.pointer); buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; + lapic = NULL; if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL)) goto out; @@ -650,7 +651,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) goto free_tmp_map; cpumask_copy(tmp_map, cpu_present_mask); - acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED); + acpi_register_lapic(physid, ACPI_MADT_ENABLED); /* * If mp_register_lapic successfully generates a new logical cpu diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index f50e7fb2a201..d2b7f27781bc 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -14,6 +14,7 @@ #include <acpi/processor.h> #include <asm/acpi.h> #include <asm/mwait.h> +#include <asm/special_insns.h> /* * Initialize bm_flags based on the CPU cache properties diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 103b6ab368d3..146a49c763a4 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -24,6 +24,10 @@ unsigned long acpi_realmode_flags; static char temp_stack[4096]; #endif +asmlinkage void acpi_enter_s3(void) +{ + acpi_enter_sleep_state(3, wake_sleep_flags); +} /** * acpi_suspend_lowlevel - save kernel state * diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 416d4be13fef..d68677a2a010 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -3,12 +3,16 @@ */ #include <asm/trampoline.h> +#include <linux/linkage.h> extern unsigned long saved_video_mode; extern long saved_magic; extern int wakeup_pmode_return; +extern u8 wake_sleep_flags; +extern asmlinkage void acpi_enter_s3(void); + extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 13ab720573e3..72610839f03b 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -74,9 +74,7 @@ restore_registers: ENTRY(do_suspend_lowlevel) call save_processor_state call save_registers - pushl $3 - call acpi_enter_sleep_state - addl $4, %esp + call acpi_enter_s3 # In case of S3 failure, we'll emerge here. Jump # to ret_point to recover diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 8ea5164cbd04..014d1d28c397 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -71,9 +71,7 @@ ENTRY(do_suspend_lowlevel) movq %rsi, saved_rsi addq $8, %rsp - movl $3, %edi - xorl %eax, %eax - call acpi_enter_sleep_state + call acpi_enter_s3 /* in case something went wrong, restore the machine status and go on */ jmp resume_point diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b1e7c7f7a0af..e66311200cbd 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -477,7 +477,7 @@ error: /* allocate and map a coherent mapping */ static void * gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, - gfp_t flag) + gfp_t flag, struct dma_attrs *attrs) { dma_addr_t paddr; unsigned long align_mask; @@ -500,7 +500,8 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, } __free_pages(page, get_order(size)); } else - return dma_generic_alloc_coherent(dev, size, dma_addr, flag); + return dma_generic_alloc_coherent(dev, size, dma_addr, flag, + attrs); return NULL; } @@ -508,7 +509,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, /* free a coherent mapping */ static void gart_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) + dma_addr_t dma_addr, struct dma_attrs *attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); free_pages((unsigned long)vaddr, get_order(size)); @@ -700,8 +701,8 @@ static struct dma_map_ops gart_dma_ops = { .unmap_sg = gart_unmap_sg, .map_page = gart_map_page, .unmap_page = gart_unmap_page, - .alloc_coherent = gart_alloc_coherent, - .free_coherent = gart_free_coherent, + .alloc = gart_alloc_coherent, + .free = gart_free_coherent, .mapping_error = gart_mapping_error, }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2eec05b6d1b8..39a222e094af 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -35,6 +35,7 @@ #include <linux/smp.h> #include <linux/mm.h> +#include <asm/irq_remapping.h> #include <asm/perf_event.h> #include <asm/x86_init.h> #include <asm/pgalloc.h> @@ -383,20 +384,25 @@ static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) static unsigned int reserve_eilvt_offset(int offset, unsigned int new) { - unsigned int rsvd; /* 0: uninitialized */ + unsigned int rsvd, vector; if (offset >= APIC_EILVT_NR_MAX) return ~0; - rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED; + rsvd = atomic_read(&eilvt_offsets[offset]); do { - if (rsvd && - !eilvt_entry_is_changeable(rsvd, new)) + vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */ + if (vector && !eilvt_entry_is_changeable(vector, new)) /* may not change if vectors are different */ return rsvd; rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); } while (rsvd != new); + rsvd &= ~APIC_EILVT_MASKED; + if (rsvd && rsvd != vector) + pr_info("LVT offset %d assigned for vector 0x%02x\n", + offset, rsvd); + return new; } @@ -1320,11 +1326,13 @@ void __cpuinit setup_local_APIC(void) acked); break; } - if (cpu_has_tsc) { - rdtscll(ntsc); - max_loops = (cpu_khz << 10) - (ntsc - tsc); - } else - max_loops--; + if (queued) { + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } } while (queued && max_loops > 0); WARN_ON(max_loops <= 0); @@ -1436,8 +1444,8 @@ void __init bsp_end_local_APIC_setup(void) * Now that local APIC setup is completed for BP, configure the fault * handling for interrupt remapping. */ - if (intr_remapping_enabled) - enable_drhd_fault_handling(); + if (irq_remapping_enabled) + irq_remap_enable_fault_handling(); } @@ -1512,7 +1520,7 @@ void enable_x2apic(void) int __init enable_IR(void) { #ifdef CONFIG_IRQ_REMAP - if (!intr_remapping_supported()) { + if (!irq_remapping_supported()) { pr_debug("intr-remapping not supported\n"); return -1; } @@ -1523,7 +1531,7 @@ int __init enable_IR(void) return -1; } - return enable_intr_remapping(); + return irq_remapping_enable(); #endif return -1; } @@ -1532,10 +1540,13 @@ void __init enable_IR_x2apic(void) { unsigned long flags; int ret, x2apic_enabled = 0; - int dmar_table_init_ret; + int hardware_init_ret; - dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret && !x2apic_supported()) + /* Make sure irq_remap_ops are initialized */ + setup_irq_remapping_ops(); + + hardware_init_ret = irq_remapping_prepare(); + if (hardware_init_ret && !x2apic_supported()) return; ret = save_ioapic_entries(); @@ -1551,7 +1562,7 @@ void __init enable_IR_x2apic(void) if (x2apic_preenabled && nox2apic) disable_x2apic(); - if (dmar_table_init_ret) + if (hardware_init_ret) ret = -1; else ret = enable_IR(); @@ -1632,9 +1643,11 @@ static int __init apic_verify(void) mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + } pr_info("Found and enabled local APIC!\n"); return 0; @@ -1652,13 +1665,15 @@ int __init apic_force_enable(unsigned long addr) * MSR. This can only be done in software for Intel P6 or later * and AMD K7 (Model > 1) or later. */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - pr_info("Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | addr; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } } return apic_verify(); } @@ -2167,8 +2182,8 @@ static int lapic_suspend(void) local_irq_save(flags); disable_local_APIC(); - if (intr_remapping_enabled) - disable_intr_remapping(); + if (irq_remapping_enabled) + irq_remapping_disable(); local_irq_restore(flags); return 0; @@ -2184,7 +2199,7 @@ static void lapic_resume(void) return; local_irq_save(flags); - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { /* * IO-APIC and PIC have their own resume routines. * We just mask them here to make sure the interrupt @@ -2204,10 +2219,12 @@ static void lapic_resume(void) * FIXME! This will be wrong if we ever support suspend on * SMP! We'll need to do this as part of the CPU restore! */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } } maxlvt = lapic_get_maxlvt(); @@ -2234,8 +2251,8 @@ static void lapic_resume(void) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - if (intr_remapping_enabled) - reenable_intr_remapping(x2apic_mode); + if (irq_remapping_enabled) + irq_remapping_reenable(x2apic_mode); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 359b6899a36c..0e881c46e8c8 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -227,6 +227,7 @@ static struct apic apic_flat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -386,6 +387,7 @@ static struct apic apic_physflat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 634ae6cdd5c9..a6e4c6e06c08 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -181,6 +181,7 @@ struct apic apic_noop = { .read = noop_apic_read, .write = noop_apic_write, + .eoi_write = noop_apic_write, .icr_read = noop_apic_icr_read, .icr_write = noop_apic_icr_write, .wait_icr_idle = noop_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index d9ea5f331ac5..6ec6d5d297c3 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -207,8 +207,11 @@ static void __init map_csrs(void) static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) { - c->phys_proc_id = node; - per_cpu(cpu_llc_id, smp_processor_id()) = node; + + if (c->phys_proc_id != node) { + c->phys_proc_id = node; + per_cpu(cpu_llc_id, smp_processor_id()) = node; + } } static int __init numachip_system_init(void) @@ -229,11 +232,10 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strncmp(oem_id, "NUMASC", 6)) { numachip_system = 1; - setup_force_cpu_cap(X86_FEATURE_X2APIC); return 1; } @@ -293,6 +295,7 @@ static struct apic apic_numachip __refconst = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 0cdec7065aff..31fbdbfbf960 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -248,6 +248,7 @@ static struct apic apic_bigsmp = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e42d1d3b9134..db4ab1be3c79 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d10a66fc5a9..ffdc152e507d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -64,9 +64,26 @@ #include <asm/apic.h> #define __apicdebuginit(type) static type __init + #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) +#ifdef CONFIG_IRQ_REMAP +static void irq_remap_modify_chip_defaults(struct irq_chip *chip); +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return cfg->irq_2_iommu.iommu != NULL; +} +#else +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return false; +} +static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) +{ +} +#endif + /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes @@ -294,6 +311,7 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg) irq_free_desc(at); } + struct io_apic { unsigned int index; unsigned int unused[3]; @@ -314,16 +332,17 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) writel(vector, &io_apic->eoi); } -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(reg, &io_apic->index); return readl(&io_apic->data); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -334,7 +353,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i * * Older SiS APIC requires we rewrite the index register */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -343,29 +362,6 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - int pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -377,6 +373,7 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + return eu.entry; } @@ -384,9 +381,11 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; } @@ -396,8 +395,7 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) * the interrupt, and we need to make sure the entry is fully populated * before that happens. */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { union entry_union eu = {{0, 0}}; @@ -409,6 +407,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -435,8 +434,7 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static int -__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -521,6 +519,7 @@ static void io_apic_sync(struct irq_pin_list *entry) * a dummy read from the IO-APIC */ struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); readl(&io_apic->data); } @@ -1322,77 +1321,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, fasteoi ? "fasteoi" : "edge"); } - -static int setup_ir_ioapic_entry(int irq, - struct IR_IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - int index; - struct irte irte; - int ioapic_id = mpc_ioapic_id(attr->ioapic); - struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); - - if (!iommu) { - pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); - return -ENODEV; - } - - index = alloc_irte(iommu, irq, 1); - if (index < 0) { - pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); - return -ENOMEM; - } - - prepare_irte(&irte, vector, destination); - - /* Set source-id of interrupt request */ - set_ioapic_sid(&irte, ioapic_id); - - modify_irte(irq, &irte); - - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " - "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " - "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " - "Avail:%X Vector:%02X Dest:%08X " - "SID:%04X SQ:%X SVT:%X)\n", - attr->ioapic, irte.present, irte.fpd, irte.dst_mode, - irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, - irte.avail, irte.vector, irte.dest_id, - irte.sid, irte.sq, irte.svt); - - memset(entry, 0, sizeof(*entry)); - - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - entry->format = 1; - entry->index = (index & 0x7fff); - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = attr->ioapic_pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr) { - if (intr_remapping_enabled) - return setup_ir_ioapic_entry(irq, - (struct IR_IO_APIC_route_entry *)entry, - destination, vector, attr); + if (irq_remapping_enabled) + return setup_ioapic_remapped_entry(irq, entry, destination, + vector, attr); memset(entry, 0, sizeof(*entry)); @@ -1549,7 +1484,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, { struct IO_APIC_route_entry entry; - if (intr_remapping_enabled) + if (irq_remapping_enabled) return; memset(&entry, 0, sizeof(entry)); @@ -1635,7 +1570,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" " Pol Stat Indx2 Zero Vect:\n"); } else { @@ -1644,7 +1579,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) } for (i = 0; i <= reg_01.bits.entries; i++) { - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { struct IO_APIC_route_entry entry; struct IR_IO_APIC_route_entry *ir_entry; @@ -2011,7 +1946,7 @@ void disable_IO_APIC(void) * IOAPIC RTE as well as interrupt-remapping table entry). * As this gets called during crash dump, keep this simple for now. */ - if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { + if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); @@ -2035,7 +1970,7 @@ void disable_IO_APIC(void) * Use virtual wire A mode when interrupt remapping is enabled. */ if (cpu_has_apic || apic_from_smp_config()) - disconnect_bsp_APIC(!intr_remapping_enabled && + disconnect_bsp_APIC(!irq_remapping_enabled && ioapic_i8259.pin != -1); } @@ -2351,71 +2286,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -#ifdef CONFIG_IRQ_REMAP - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For both level and edge triggered, irq migration is a simple atomic - * update(of vector and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we eliminate the io-apic RTE modification (with the - * updated vector information), by using a virtual vector (io-apic pin number). - * Real vector that is used for interrupting cpu will be coming from - * the interrupt-remapping table entry. - * - * As the migration is a simple atomic update of IRTE, the same mechanism - * is used to migrate MSI irq's in the presence of interrupt-remapping. - */ -static int -ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest, irq = data->irq; - struct irte irte; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - if (get_irte(irq, &irte)) - return -EBUSY; - - if (assign_irq_vector(irq, cfg, mask)) - return -EBUSY; - - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Atomically updates the IRTE with the new destination, vector - * and flushes the interrupt entry cache. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. - */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - cpumask_copy(data->affinity, mask); - return 0; -} - -#else -static inline int -ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - return 0; -} -#endif - asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -2512,21 +2382,96 @@ static void ack_apic_edge(struct irq_data *data) atomic_t irq_mis_count; -static void ack_apic_level(struct irq_data *data) +#ifdef CONFIG_GENERIC_PENDING_IRQ +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) { - struct irq_cfg *cfg = data->chip_data; - int i, do_unmask_irq = 0, irq = data->irq; - unsigned long v; + struct irq_pin_list *entry; + unsigned long flags; - irq_complete_move(cfg); -#ifdef CONFIG_GENERIC_PENDING_IRQ + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + int pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} + +static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +{ /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - do_unmask_irq = 1; mask_ioapic(cfg); + return true; } + return false; +} + +static inline void ioapic_irqd_unmask(struct irq_data *data, + struct irq_cfg *cfg, bool masked) +{ + if (unlikely(masked)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(cfg)) + irq_move_masked_irq(data); + unmask_ioapic(cfg); + } +} +#else +static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +{ + return false; +} +static inline void ioapic_irqd_unmask(struct irq_data *data, + struct irq_cfg *cfg, bool masked) +{ +} #endif +static void ack_apic_level(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + int i, irq = data->irq; + unsigned long v; + bool masked; + + irq_complete_move(cfg); + masked = ioapic_irqd_mask(data, cfg); + /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various @@ -2581,38 +2526,7 @@ static void ack_apic_level(struct irq_data *data) eoi_ioapic_irq(irq, cfg); } - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(cfg)) - irq_move_masked_irq(data); - unmask_ioapic(cfg); - } + ioapic_irqd_unmask(data, cfg, masked); } #ifdef CONFIG_IRQ_REMAP @@ -2639,7 +2553,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_eoi = ir_ack_apic_level; #ifdef CONFIG_SMP - chip->irq_set_affinity = ir_ioapic_set_affinity; + chip->irq_set_affinity = set_remapped_irq_affinity; #endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2852,7 +2766,7 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { - if (intr_remapping_enabled) + if (irq_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; @@ -2885,7 +2799,7 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } - if (intr_remapping_enabled) + if (irq_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); @@ -3109,7 +3023,7 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); if (irq_remapped(cfg)) - free_irte(irq); + free_remapped_irq(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); @@ -3138,54 +3052,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); if (irq_remapped(cfg)) { - struct irte irte; - int ir_index; - u16 sub_handle; - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - prepare_irte(&irte, cfg->vector, dest); - - /* Set source-id of interrupt request */ - if (pdev) - set_msi_sid(&irte, pdev); - else - set_hpet_sid(&irte, hpet_id); - - modify_irte(irq, &irte); + compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); + return err; + } + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); - } else { - if (x2apic_enabled()) - msg->address_hi = MSI_ADDR_BASE_HI | - MSI_ADDR_EXT_DEST_ID(dest); - else - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } return err; } @@ -3228,33 +3122,6 @@ static struct irq_chip msi_chip = { .irq_retrigger = ioapic_retrigger_irq, }; -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. - */ -static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} - static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { struct irq_chip *chip = &msi_chip; @@ -3285,7 +3152,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; struct msi_desc *msidesc; - struct intel_iommu *iommu = NULL; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) @@ -3299,7 +3165,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (irq == 0) return -1; irq_want = irq + 1; - if (!intr_remapping_enabled) + if (!irq_remapping_enabled) goto no_ir; if (!sub_handle) { @@ -3307,23 +3173,16 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) * allocate the consecutive block of IRTE's * for 'nvec' */ - index = msi_alloc_irte(dev, irq, nvec); + index = msi_alloc_remapped_irq(dev, irq, nvec); if (index < 0) { ret = index; goto error; } } else { - iommu = map_dev_to_ir(dev); - if (!iommu) { - ret = -ENOENT; + ret = msi_setup_remapped_irq(dev, irq, index, + sub_handle); + if (ret < 0) goto error; - } - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. - */ - set_irte_irq(irq, iommu, index, sub_handle); } no_ir: ret = setup_msi_irq(dev, msidesc, irq); @@ -3441,15 +3300,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) struct msi_msg msg; int ret; - if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_hpet_to_ir(id); - int index; - - if (!iommu) - return -1; - - index = alloc_irte(iommu, irq, 1); - if (index < 0) + if (irq_remapping_enabled) { + if (!setup_hpet_msi_remapped(irq, id)) return -1; } @@ -3828,8 +3680,8 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - if (intr_remapping_enabled) - ir_ioapic_set_affinity(idata, mask, false); + if (irq_remapping_enabled) + set_remapped_irq_affinity(idata, mask, false); else ioapic_set_affinity(idata, mask, false); } @@ -3871,7 +3723,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_and_gsi_init(void) +void __init native_io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 00d2422ca7c9..f00a68cca37a 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index ff2c1b9aac4d..1b291da09e60 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -142,6 +142,7 @@ static struct apic apic_default = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index fea000b27f07..659897c00755 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -546,6 +546,7 @@ static struct apic apic_summit = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 9193713060a9..ff35cff0e1a7 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,7 +213,7 @@ static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index bcd1db6eaca9..c17e982db275 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -24,6 +24,12 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (x2apic_phys) return x2apic_enabled(); + else if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) && + x2apic_enabled()) { + printk(KERN_DEBUG "System requires x2apic physical mode\n"); + return 1; + } else return 0; } @@ -119,7 +125,7 @@ static struct apic apic_x2apic_phys = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_Fixed, @@ -166,6 +172,7 @@ static struct apic apic_x2apic_phys = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index fc4771425852..c6d03f7a4401 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -266,6 +266,11 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } +static int uv_apic_id_valid(int apicid) +{ + return 1; +} + static int uv_apic_id_registered(void) { return 1; @@ -351,7 +356,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = uv_apic_id_valid, .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, @@ -399,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d56931a15b3..07b0c0db466c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -231,7 +231,6 @@ #include <linux/syscore_ops.h> #include <linux/i8253.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/olpc.h> @@ -2402,7 +2401,7 @@ static void __exit apm_exit(void) * (pm_idle), Wait for all processors to update cached/local * copies of pm_idle before proceeding. */ - cpu_idle_wait(); + kick_all_cpus_sync(); } if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) && (apm_info.connection_version > 0x0100)) { diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 834e897b1e25..1b4754f82ba7 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,6 +1,12 @@ #include <asm/ia32.h> #define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif static char syscalls_64[] = { #include <asm/syscalls_64.h> }; diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 5da1269e8ddc..e2dbcb7dabdd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -27,21 +27,29 @@ static int num_scan_areas; static __init int set_corruption_check(char *arg) { - char *end; + ssize_t ret; + unsigned long val; - memory_corruption_check = simple_strtol(arg, &end, 10); + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; - return (*end == 0) ? 0 : -EINVAL; + memory_corruption_check = val; + return 0; } early_param("memory_corruption_check", set_corruption_check); static __init int set_corruption_check_period(char *arg) { - char *end; + ssize_t ret; + unsigned long val; - corruption_check_period = simple_strtoul(arg, &end, 10); + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; - return (*end == 0) ? 0 : -EINVAL; + corruption_check_period = val; + return 0; } early_param("memory_corruption_check_period", set_corruption_check_period); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0a44b90602b0..146bb6218eec 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -26,7 +26,8 @@ * contact AMD for precise details and a CPU swap. * * See http://www.multimania.com/poulot/k6bug.html - * http://www.amd.com/K6/k6docs/revgd.html + * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6" + * (Publication # 21266 Issue Date: August 1998) * * The following test is erm.. interesting. AMD neglected to up * the chip setting when fixing the bug but they also tweaked some @@ -94,7 +95,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) "system stability may be impaired when more than 32 MB are used.\n"); else printk(KERN_CONT "probably OK (after B9730xxxx).\n"); - printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); } /* K6 with old style WHCR */ @@ -353,10 +353,11 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) node = per_cpu(cpu_llc_id, cpu); /* - * If core numbers are inconsistent, it's likely a multi-fabric platform, - * so invoke platform-specific handler + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. */ - if (c->phys_proc_id != node) + if (x86_cpuinit.fixup_cpu_id) x86_cpuinit.fixup_cpu_id(c, node); if (!node_online(node)) { @@ -579,6 +580,24 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } + /* re-enable TopologyExtensions if switched off by BIOS */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + u64 val; + + if (!rdmsrl_amd_safe(0xc0011005, &val)) { + val |= 1ULL << 54; + wrmsrl_amd_safe(0xc0011005, val); + rdmsrl(0xc0011005, val); + if (val & (1ULL << 54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + printk(KERN_INFO FW_INFO "CPU: Re-enabling " + "disabled Topology Extensions Support\n"); + } + } + } + cpu_detect_cache_sizes(c); /* Multi core CPU? */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e49477444fff..82f29e70d058 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -999,7 +999,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "\n"); - __print_cpu_msr(); + print_cpu_msr(c); } void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) @@ -1163,15 +1163,6 @@ static void dbg_restore_debug_regs(void) #endif /* ! CONFIG_KGDB */ /* - * Prints an error where the NUMA and configured core-number mismatch and the - * platform didn't override this to fix it up - */ -void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) -{ - pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); -} - -/* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a @@ -1194,7 +1185,7 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && percpu_read(numa_node) == 0 && + if (cpu != 0 && this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 73d08ed98a64..9a7c90d80bc4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -433,14 +433,14 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, /* check if @slot is already used or the index is already disabled */ ret = amd_get_l3_disable_slot(nb, slot); if (ret >= 0) - return -EINVAL; + return -EEXIST; if (index > nb->l3_cache.indices) return -EINVAL; /* check whether the other slot has disabled the same index already */ if (index == amd_get_l3_disable_slot(nb, !slot)) - return -EINVAL; + return -EEXIST; amd_l3_disable_index(nb, cpu, slot, index); @@ -468,8 +468,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); if (err) { if (err == -EEXIST) - printk(KERN_WARNING "L3 disable slot %d in use!\n", - slot); + pr_warning("L3 slot %d in use/index already disabled!\n", + slot); return err; } return count; @@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid >> index_msb; + l2_id = c->apicid & ~((1 << index_msb) - 1); break; case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order( num_threads_sharing); - l3_id = c->apicid >> index_msb; + l3_id = c->apicid & ~((1 << index_msb) - 1); break; default: break; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 5502b289341b..36565373af87 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -23,7 +23,7 @@ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) * * Arrays used to match for this should also be declared using - * MODULE_DEVICE_TABLE(x86_cpu, ...) + * MODULE_DEVICE_TABLE(x86cpu, ...) * * This always matches against the boot cpu, assuming models and features are * consistent over all CPUs. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087d..2afcbd253e1d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -583,7 +583,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - percpu_inc(mce_poll_count); + this_cpu_inc(mce_poll_count); mce_gather_info(&m, NULL); @@ -945,9 +945,10 @@ struct mce_info { atomic_t inuse; struct task_struct *t; __u64 paddr; + int restartable; } mce_info[MCE_INFO_MAX]; -static void mce_save_info(__u64 addr) +static void mce_save_info(__u64 addr, int c) { struct mce_info *mi; @@ -955,6 +956,7 @@ static void mce_save_info(__u64 addr) if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { mi->t = current; mi->paddr = addr; + mi->restartable = c; return; } } @@ -1015,7 +1017,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); - percpu_inc(mce_exception_count); + this_cpu_inc(mce_exception_count); if (!banks) goto out; @@ -1130,7 +1132,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst == MCE_AR_SEVERITY) { /* schedule action before return to userland */ - mce_save_info(m.addr); + mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); set_thread_flag(TIF_MCE_NOTIFY); } else if (kill_it) { force_sig(SIGBUS, current); @@ -1179,7 +1181,13 @@ void mce_notify_process(void) pr_err("Uncorrected hardware memory error in user-access at %llx", mi->paddr); - if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || + mi->restartable == 0) { pr_err("Memory error not recovered"); force_sig(SIGBUS, current); } @@ -1423,6 +1431,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && banks > 0) mce_banks[0].ctl = 0; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 val, hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + for (i = 0; i < ARRAY_SIZE(msrs); i++) { + rdmsrl(msrs[i], val); + + /* CntP bit set? */ + if (val & BIT(62)) { + val &= ~BIT(62); + wrmsrl(msrs[i], val); + } + } + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } } if (c->x86_vendor == X86_VENDOR_INTEL) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 99b57179f912..f4873a64f46d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -51,6 +51,7 @@ struct threshold_block { unsigned int cpu; u32 address; u16 interrupt_enable; + bool interrupt_capable; u16 threshold_limit; struct kobject kobj; struct list_head miscj; @@ -83,6 +84,21 @@ struct thresh_restart { u16 old_limit; }; +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; @@ -104,8 +120,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ +/* + * Called via smp_call_function_single(), must be called with correct + * cpu affinity. + */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; @@ -128,6 +146,12 @@ static void threshold_restart_bank(void *_tr) (new_count & THRESHOLD_MAX); } + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; + if (tr->set_lvt_off) { if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { /* set new lvt offset */ @@ -136,9 +160,10 @@ static void threshold_restart_bank(void *_tr) } } - tr->b->interrupt_enable ? - (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (hi &= ~MASK_INT_TYPE_HI); + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: hi |= MASK_COUNT_EN_HI; wrmsr(tr->b->address, lo, hi); @@ -202,14 +227,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; - offset = setup_APIC_mce(offset, - (high & MASK_LVTOFF_HI) >> 20); - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + b.interrupt_capable = lvt_interrupt_supported(bank, high); + + if (b.interrupt_capable) { + int new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + } mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; @@ -309,6 +337,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) struct thresh_restart tr; unsigned long new; + if (!b->interrupt_capable) + return -EINVAL; + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; @@ -390,10 +421,10 @@ RW_ATTR(threshold_limit); RW_ATTR(error_count); static struct attribute *default_attrs[] = { - &interrupt_enable.attr, &threshold_limit.attr, &error_count.attr, - NULL + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, }; #define to_block(k) container_of(k, struct threshold_block, kobj) @@ -467,8 +498,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, b->cpu = cpu; b->address = address; b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; + if (b->interrupt_capable) + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + else + threshold_ktype.default_attrs[2] = NULL; + INIT_LIST_HEAD(&b->miscj); if (per_cpu(threshold_banks, cpu)[bank]->blocks) { diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 5c0e6533d9bc..2d5454cd2c4f 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,7 +9,6 @@ #include <linux/smp.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/mce.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 67bb17a37a0a..47a1870279aa 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -25,7 +25,6 @@ #include <linux/cpu.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 54060f565974..2d7998fb628c 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,7 +8,6 @@ #include <linux/init.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/mce.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 97b26356e9ee..75772ae6c65f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -12,7 +12,6 @@ #include <asm/processor-flags.h> #include <asm/cpufeature.h> #include <asm/tlbflush.h> -#include <asm/system.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/pat.h> diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 79289632cb27..a041e094b8b9 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -167,6 +167,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) { int err = 0; mtrr_type type; + unsigned long base; unsigned long size; struct mtrr_sentry sentry; struct mtrr_gentry gentry; @@ -267,14 +268,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that go above 4GB */ - if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) gentry.base = gentry.size = gentry.type = 0; else { - gentry.base <<= PAGE_SHIFT; + gentry.base = base << PAGE_SHIFT; gentry.size = size << PAGE_SHIFT; gentry.type = type; } @@ -321,11 +322,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that would overflow */ if (size != (__typeof__(gentry.size))size) gentry.base = gentry.size = gentry.type = 0; else { + gentry.base = base; gentry.size = size; gentry.type = type; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fa2900c0e398..e049d6da0183 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -29,7 +29,6 @@ #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> -#include <asm/compat.h> #include <asm/smp.h> #include <asm/alternative.h> #include <asm/timer.h> @@ -485,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; - - /* mark not used */ - event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); @@ -1187,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1223,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; @@ -1314,6 +1308,11 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1388,6 +1387,7 @@ static int __init init_hw_perf_events(void) } x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + x86_pmu_format_group.attrs = x86_pmu.format_attrs; pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); @@ -1616,6 +1616,9 @@ static int x86_pmu_event_idx(struct perf_event *event) { int idx = event->hw.idx; + if (!x86_pmu.attr_rdpmc) + return 0; + if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { idx -= X86_PMC_IDX_FIXED; idx |= 1 << 30; @@ -1668,6 +1671,7 @@ static struct attribute_group x86_pmu_attr_group = { static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, + &x86_pmu_format_group, NULL, }; @@ -1699,14 +1703,19 @@ static struct pmu pmu = { .flush_branch_stack = x86_pmu_flush_branch_stack, }; -void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + userpg->cap_usr_time = 0; + userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->pmc_width = x86_pmu.cntval_bits; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; + userpg->cap_usr_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; @@ -1748,6 +1757,9 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) } #ifdef CONFIG_COMPAT + +#include <asm/compat.h> + static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8484e77c211e..6638aaf54493 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -339,6 +339,7 @@ struct x86_pmu { * sysfs attrs */ int attr_rdpmc; + struct attribute **format_attrs; /* * CPU Hotplug hooks diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index dd002faff7a6..11a4eb9131d5 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event) static int amd_pmu_hw_config(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); + int ret; + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + ret = x86_pmu_hw_config(event); if (ret) return ret; @@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * when we come here */ for (i = 0; i < x86_pmu.num_counters; i++) { - if (nb->owners[i] == event) { - cmpxchg(nb->owners+i, event, NULL); + if (cmpxchg(nb->owners + i, event, NULL) == event) break; - } } } @@ -404,6 +407,21 @@ static void amd_pmu_cpu_dead(int cpu) } } +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = x86_pmu_handle_irq, @@ -426,6 +444,8 @@ static __initconst const struct x86_pmu amd_pmu = { .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, + .format_attrs = amd_format_attr, + .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, .cpu_dead = amd_pmu_cpu_dead, @@ -476,6 +496,7 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x023 DE PERF_CTL[2:0] * 0x02D LS PERF_CTL[3] * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) * 0x043 CU PERF_CTL[2:0] * 0x045 CU PERF_CTL[2:0] * 0x046 CU PERF_CTL[2:0] @@ -489,10 +510,12 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x0DD LS PERF_CTL[5:0] * 0x0DE LS PERF_CTL[5:0] * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] * 0x1D6 EX PERF_CTL[5:0] * 0x1D8 EX PERF_CTL[5:0] * - * (*) depending on the umask all FPU counters may be used + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time */ static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); @@ -542,6 +565,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev return &amd_f15_PMC3; case 0x02E: return &amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; default: return &amd_f15_PMC50; } @@ -596,6 +625,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .cpu_dead = amd_pmu_cpu_dead, #endif .cpu_starting = amd_pmu_cpu_starting, + .format_attrs = amd_format_attr, }; __init int amd_pmu_init(void) diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3b8a2d30d14e..da9bcdcd9856 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -9,6 +9,7 @@ #include <linux/perf_event.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/ptrace.h> #include <asm/apic.h> @@ -16,36 +17,591 @@ static u32 ibs_caps; #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) -static struct pmu perf_ibs; +#include <linux/kprobes.h> +#include <linux/hardirq.h> + +#include <asm/nmi.h> + +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; + u64 valid_mask; + u64 max_period; + unsigned long offset_mask[1]; + int offset_max; + struct cpu_perf_ibs __percpu *pcpu; + u64 (*get_count)(u64 config); +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < (s64)min)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } + + *hw_period = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} + +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} static int perf_ibs_init(struct perf_event *event) { - if (perf_ibs.type != event->attr.type) + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + int ret; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) return -ENOENT; + + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ + return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; + } else { + max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!hwc->sample_period) + return -EINVAL; + + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + return 0; } +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 *period) +{ + int overflow; + + /* ignore lower 4 bits in min count: */ + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return overflow; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ + u64 count = 0; + + if (config & IBS_OP_VAL) + count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + + if (ibs_caps & IBS_CAPS_RDWROPCNT) + count += (config & IBS_OP_CUR_CNT) >> 32; + + return count; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 *config) +{ + u64 count = perf_ibs->get_count(*config); + + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. + */ + while (!perf_event_try_update(event, count, 64)) { + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); + } +} + +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 period; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &period); + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; + int stopping; + + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; + + rdmsrl(hwc->config_base, config); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + perf_ibs_disable_event(perf_ibs, hwc, config); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. + */ + config &= ~perf_ibs->valid_mask; + + perf_ibs_event_update(perf_ibs, event, &config); + hwc->state |= PERF_HES_UPTODATE; +} + static int perf_ibs_add(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + return 0; } static void perf_ibs_del(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, PERF_EF_UPDATE); + + pcpu->event = NULL; + + perf_event_update_userpage(event); } -static struct pmu perf_ibs = { - .event_init= perf_ibs_init, - .add= perf_ibs_add, - .del= perf_ibs_del, +static void perf_ibs_read(struct perf_event *event) { } + +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + + .get_count = get_ibs_fetch_count, }; +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT, + .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + + .get_count = get_ibs_op_count, +}; + +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; + struct hw_perf_event *hwc = &event->hw; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size, check_rip, offset_max, throttle = 0; + unsigned int msr; + u64 *buf, *config, period; + + if (!test_bit(IBS_STARTED, pcpu->state)) { + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could be still incomming NMIs + * with samples that even have the valid bit cleared. + * Mark all this NMIs as handled. + */ + return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; + } + + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + return 0; + + config = &ibs_data.regs[0]; + perf_ibs_event_update(perf_ibs, event, config); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) + offset_max = 2; + else + offset_max = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { + instruction_pointer_set(®s, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } + + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.size = sizeof(u32) + ibs_data.size; + raw.data = ibs_data.data; + data.raw = &raw; + } + + throttle = perf_event_overflow(event, &data, ®s); +out: + if (throttle) + perf_ibs_disable_event(perf_ibs, hwc, *config); + else + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); + + return 1; +} + +static int __kprobes +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + static __init int perf_event_ibs_init(void) { if (!ibs_caps) return -ENODEV; /* ibs not supported by the cpu */ - perf_pmu_register(&perf_ibs, "ibs", -1); + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ibs_caps & IBS_CAPS_OPCNT) + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6a84e7f28f05..166546ec6aef 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 status; int handled; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1082,7 +1080,7 @@ again: if (!intel_pmu_save_and_restart(event)) continue; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) data.br_stack = &cpuc->lbr_stack; @@ -1431,6 +1429,24 @@ static void core_pmu_enable_all(int added) } } +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -1455,6 +1471,7 @@ static __initconst const struct x86_pmu core_pmu = { .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1553,6 +1570,21 @@ static void intel_pmu_flush_branch_stack(void) intel_pmu_lbr_reset(); } +PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); + +static struct attribute *intel_arch3_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_any.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + + &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + NULL, +}; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1576,6 +1608,8 @@ static __initconst const struct x86_pmu intel_pmu = { .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .format_attrs = intel_arch3_formats_attr, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f64df19e7dd..5a3edc27f6e5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); regs.ip = 0; /* @@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, if (!intel_pmu_save_and_restart(event)) return; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); /* * We use the interrupt regs as a base because the PEBS record diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ef484d9d0a25..47124a73dd73 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) handled += overflow; /* event overflow for sure */ - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!x86_perf_event_set_period(event)) continue; + + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -1271,6 +1271,17 @@ done: return num ? -EINVAL : 0; } +PMU_FORMAT_ATTR(cccr, "config:0-31" ); +PMU_FORMAT_ATTR(escr, "config:32-62"); +PMU_FORMAT_ATTR(ht, "config:63" ); + +static struct attribute *intel_p4_formats_attr[] = { + &format_attr_cccr.attr, + &format_attr_escr.attr, + &format_attr_ht.attr, + NULL, +}; + static __initconst const struct x86_pmu p4_pmu = { .name = "Netburst P4/Xeon", .handle_irq = p4_pmu_handle_irq, @@ -1305,6 +1316,8 @@ static __initconst const struct x86_pmu p4_pmu = { * the former idea is taken from OProfile code */ .perfctr_second_write = 1, + + .format_attrs = intel_p4_formats_attr, }; __init int p4_pmu_init(void) diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index c7181befecde..32bcfc7dd230 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -87,6 +87,23 @@ static void p6_pmu_enable_event(struct perf_event *event) (void)checking_wrmsrl(hwc->config_base, val); } +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_p6_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = x86_pmu_handle_irq, @@ -115,6 +132,8 @@ static __initconst const struct x86_pmu p6_pmu = { .cntval_mask = (1ULL << 32) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = p6_event_constraints, + + .format_attrs = intel_p6_formats_attr, }; __init int p6_pmu_init(void) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a524353d93f2..39472dd2323f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -43,7 +43,6 @@ #include <asm/processor.h> #include <asm/msr.h> -#include <asm/system.h> static struct class *cpuid_class; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4025fe4f928f..571246d81edf 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -37,13 +37,16 @@ print_ftrace_graph_addr(unsigned long addr, void *data, const struct stacktrace_ops *ops, struct thread_info *tinfo, int *graph) { - struct task_struct *task = tinfo->task; + struct task_struct *task; unsigned long ret_addr; - int index = task->curr_ret_stack; + int index; if (addr != (unsigned long)return_to_handler) return; + task = tinfo->task; + index = task->curr_ret_stack; + if (!task->ret_stack || index < *graph) return; @@ -265,10 +268,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) #endif printk("\n"); if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; - show_registers(regs); + show_regs(regs); #ifdef CONFIG_X86_32 if (user_mode_vm(regs)) { sp = regs->sp; @@ -308,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err) static int __init kstack_setup(char *s) { + ssize_t ret; + unsigned long val; + if (!s) return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + kstack_depth_to_print = val; return 0; } early_param("kstack", kstack_setup); static int __init code_bytes_setup(char *s) { - code_bytes = simple_strtoul(s, NULL, 0); + ssize_t ret; + unsigned long val; + + if (!s) + return -EINVAL; + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + + code_bytes = val; if (code_bytes > 8192) code_bytes = 8192; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 88ec9129271d..e0b1d783daab 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } -void show_registers(struct pt_regs *regs) +void show_regs(struct pt_regs *regs) { int i; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 17107bd6e1f0..791b76122aa8 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_registers(struct pt_regs *regs) +void show_regs(struct pt_regs *regs) { int i; unsigned long sp; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7b784f4ef1e4..01ccf9b71473 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -56,6 +56,7 @@ #include <asm/irq_vectors.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> +#include <asm/asm.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -151,10 +152,8 @@ .pushsection .fixup, "ax" 99: movl $0, (%esp) jmp 98b -.section __ex_table, "a" - .align 4 - .long 98b, 99b .popsection + _ASM_EXTABLE(98b,99b) .endm .macro PTGS_TO_GS @@ -164,10 +163,8 @@ .pushsection .fixup, "ax" 99: movl $0, PT_GS(%esp) jmp 98b -.section __ex_table, "a" - .align 4 - .long 98b, 99b .popsection + _ASM_EXTABLE(98b,99b) .endm .macro GS_TO_REG reg @@ -249,12 +246,10 @@ jmp 2b 6: movl $0, (%esp) jmp 3b -.section __ex_table, "a" - .align 4 - .long 1b, 4b - .long 2b, 5b - .long 3b, 6b .popsection + _ASM_EXTABLE(1b,4b) + _ASM_EXTABLE(2b,5b) + _ASM_EXTABLE(3b,6b) POP_GS_EX .endm @@ -415,10 +410,7 @@ sysenter_past_esp: jae syscall_fault 1: movl (%ebp),%ebp movl %ebp,PT_EBP(%esp) -.section __ex_table,"a" - .align 4 - .long 1b,syscall_fault -.previous + _ASM_EXTABLE(1b,syscall_fault) GET_THREAD_INFO(%ebp) @@ -485,10 +477,8 @@ sysexit_audit: .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) jmp 1b -.section __ex_table,"a" - .align 4 - .long 1b,2b .popsection + _ASM_EXTABLE(1b,2b) PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) @@ -543,10 +533,7 @@ ENTRY(iret_exc) pushl $do_iret_error jmp error_code .previous -.section __ex_table,"a" - .align 4 - .long irq_return,iret_exc -.previous + _ASM_EXTABLE(irq_return,iret_exc) CFI_RESTORE_STATE ldt_ss: @@ -901,10 +888,7 @@ END(device_not_available) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret -.section __ex_table,"a" - .align 4 - .long native_iret, iret_exc -.previous + _ASM_EXTABLE(native_iret, iret_exc) END(native_iret) ENTRY(native_irq_enable_sysexit) @@ -1093,13 +1077,10 @@ ENTRY(xen_failsafe_callback) movl %eax,16(%esp) jmp 4b .previous -.section __ex_table,"a" - .align 4 - .long 1b,6b - .long 2b,7b - .long 3b,8b - .long 4b,9b -.previous + _ASM_EXTABLE(1b,6b) + _ASM_EXTABLE(2b,7b) + _ASM_EXTABLE(3b,8b) + _ASM_EXTABLE(4b,9b) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 734ebd1d3caa..320852d02026 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> #include <asm/percpu.h> +#include <asm/asm.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ @@ -481,7 +482,12 @@ GLOBAL(system_call_after_swapgs) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz tracesys system_call_fastpath: +#if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif ja badsys movq %r10,%rcx call *sys_call_table(,%rax,8) # XXX: rip relative @@ -595,7 +601,12 @@ tracesys: */ LOAD_ARGS ARGOFFSET, 1 RESTORE_REST +#if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) @@ -735,6 +746,40 @@ ENTRY(stub_rt_sigreturn) CFI_ENDPROC END(stub_rt_sigreturn) +#ifdef CONFIG_X86_X32_ABI + PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx + +ENTRY(stub_x32_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys32_x32_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_rt_sigreturn) + +ENTRY(stub_x32_execve) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx + call sys32_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execve) + +#endif + /* * Build the entry stubs and pointer table with some assembler magic. * We pack 7 stubs into a single 32-byte chunk, which will fit in a @@ -856,18 +901,12 @@ restore_args: irq_return: INTERRUPT_RETURN - - .section __ex_table, "a" - .quad irq_return, bad_iret - .previous + _ASM_EXTABLE(irq_return, bad_iret) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iretq - - .section __ex_table,"a" - .quad native_iret, bad_iret - .previous + _ASM_EXTABLE(native_iret, bad_iret) #endif .section .fixup,"ax" @@ -1137,10 +1176,7 @@ gs_change: CFI_ENDPROC END(native_load_gs_index) - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous + _ASM_EXTABLE(gs_change,bad_gs) .section .fixup,"ax" /* running with kernelgs */ bad_gs: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c9a281f272fd..32ff36596ab1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,40 +24,21 @@ #include <trace/syscall.h> #include <asm/cacheflush.h> +#include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> -#include <asm/nmi.h> - #ifdef CONFIG_DYNAMIC_FTRACE -/* - * modifying_code is set to notify NMIs that they need to use - * memory barriers when entering or exiting. But we don't want - * to burden NMIs with unnecessary memory barriers when code - * modification is not being done (which is most of the time). - * - * A mutex is already held when ftrace_arch_code_modify_prepare - * and post_process are called. No locks need to be taken here. - * - * Stop machine will make sure currently running NMIs are done - * and new NMIs will see the updated variable before we need - * to worry about NMIs doing memory barriers. - */ -static int modifying_code __read_mostly; -static DEFINE_PER_CPU(int, save_modifying_code); - int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); set_all_modules_text_rw(); - modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { - modifying_code = 0; set_all_modules_text_ro(); set_kernel_text_ro(); return 0; @@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } -/* - * Modifying code must take extra care. On an SMP machine, if - * the code being modified is also being executed on another CPU - * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. - * But this does not stop NMIs from happening. We still need - * to protect against that. We separate out the modification of - * the code to take care of this. - * - * Two buffers are added: An IP buffer and a "code" buffer. - * - * 1) Put the instruction pointer into the IP buffer - * and the new code into the "code" buffer. - * 2) Wait for any running NMIs to finish and set a flag that says - * we are modifying code, it is done in an atomic operation. - * 3) Write the code - * 4) clear the flag. - * 5) Wait for any running NMIs to finish. - * - * If an NMI is executed, the first thing it does is to call - * "ftrace_nmi_enter". This will check if the flag is set to write - * and if it is, it will write what is in the IP and "code" buffers. - * - * The trick is, it does not matter if everyone is writing the same - * content to the code location. Also, if a CPU is executing code - * it is OK to write to that code location if the contents being written - * are the same as what exists. - */ - -#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ -static atomic_t nmi_running = ATOMIC_INIT(0); -static int mod_code_status; /* holds return value of text write */ -static void *mod_code_ip; /* holds the IP to write to */ -static const void *mod_code_newcode; /* holds the text to write to the IP */ - -static unsigned nmi_wait_count; -static atomic_t nmi_update_count = ATOMIC_INIT(0); - -int ftrace_arch_read_dyn_info(char *buf, int size) -{ - int r; - - r = snprintf(buf, size, "%u %u", - nmi_wait_count, - atomic_read(&nmi_update_count)); - return r; -} - -static void clear_mod_flag(void) -{ - int old = atomic_read(&nmi_running); - - for (;;) { - int new = old & ~MOD_CODE_WRITE_FLAG; - - if (old == new) - break; - - old = atomic_cmpxchg(&nmi_running, old, new); - } -} - -static void ftrace_mod_code(void) -{ - /* - * Yes, more than one CPU process can be writing to mod_code_status. - * (and the code itself) - * But if one were to fail, then they all should, and if one were - * to succeed, then they all should. - */ - mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, - MCOUNT_INSN_SIZE); - - /* if we fail, then kill any new writers */ - if (mod_code_status) - clear_mod_flag(); -} - -void ftrace_nmi_enter(void) -{ - __this_cpu_write(save_modifying_code, modifying_code); - - if (!__this_cpu_read(save_modifying_code)) - return; - - if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { - smp_rmb(); - ftrace_mod_code(); - atomic_inc(&nmi_update_count); - } - /* Must have previous changes seen before executions */ - smp_mb(); -} - -void ftrace_nmi_exit(void) -{ - if (!__this_cpu_read(save_modifying_code)) - return; - - /* Finish all executions before clearing nmi_running */ - smp_mb(); - atomic_dec(&nmi_running); -} - -static void wait_for_nmi_and_set_mod_flag(void) -{ - if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) - return; - - do { - cpu_relax(); - } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); - - nmi_wait_count++; -} - -static void wait_for_nmi(void) -{ - if (!atomic_read(&nmi_running)) - return; - - do { - cpu_relax(); - } while (atomic_read(&nmi_running)); - - nmi_wait_count++; -} - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa(ip)); - mod_code_ip = (void *)ip; - mod_code_newcode = new_code; - - /* The buffers need to be visible before we let NMIs write them */ - smp_mb(); - - wait_for_nmi_and_set_mod_flag(); - - /* Make sure all running NMIs have finished before we write the code */ - smp_mb(); - - ftrace_mod_code(); - - /* Make sure the write happens before clearing the bit */ - smp_mb(); - - clear_mod_flag(); - wait_for_nmi(); - - return mod_code_status; + return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } static const unsigned char *ftrace_nop_replace(void) @@ -334,6 +168,336 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } +int modifying_ftrace_code __read_mostly; + +/* + * A breakpoint was added to the code address we are about to + * modify, and this is the handle that will just skip over it. + * We are either changing a nop into a trace call, or a trace + * call to a nop. While the change is taking place, we treat + * it just like it was a nop. + */ +int ftrace_int3_handler(struct pt_regs *regs) +{ + if (WARN_ON_ONCE(!regs)) + return 0; + + if (!ftrace_location(regs->ip - 1)) + return 0; + + regs->ip += MCOUNT_INSN_SIZE - 1; + + return 1; +} + +static int ftrace_write(unsigned long ip, const char *val, int size) +{ + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + + return probe_kernel_write((void *)ip, val, size); +} + +static int add_break(unsigned long ip, const char *old) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + if (ftrace_write(ip, &brk, 1)) + return -EPERM; + + return 0; +} + +static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned const char *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + + return add_break(rec->ip, old); +} + + +static int add_brk_on_nop(struct dyn_ftrace *rec) +{ + unsigned const char *old; + + old = ftrace_nop_replace(); + + return add_break(rec->ip, old); +} + +static int add_breakpoints(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_brk_on_nop(rec); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_brk_on_call(rec, ftrace_addr); + } + return 0; +} + +/* + * On error, we need to remove breakpoints. This needs to + * be done caefully. If the address does not currently have a + * breakpoint, we know we are done. Otherwise, we look at the + * remaining 4 bytes of the instruction. If it matches a nop + * we replace the breakpoint with the nop. Otherwise we replace + * it with the call instruction. + */ +static int remove_breakpoint(struct dyn_ftrace *rec) +{ + unsigned char ins[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + const unsigned char *nop; + unsigned long ftrace_addr; + unsigned long ip = rec->ip; + + /* If we fail the read, just give up */ + if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* If this does not have a breakpoint, we are done */ + if (ins[0] != brk) + return -1; + + nop = ftrace_nop_replace(); + + /* + * If the last 4 bytes of the instruction do not match + * a nop, then we assume that this is a call to ftrace_addr. + */ + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { + /* + * For extra paranoidism, we check if the breakpoint is on + * a call that would actually jump to the ftrace_addr. + * If not, don't touch the breakpoint, we make just create + * a disaster. + */ + ftrace_addr = (unsigned long)FTRACE_ADDR; + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) + return -EINVAL; + } + + return probe_kernel_write((void *)ip, &nop[0], 1); +} + +static int add_update_code(unsigned long ip, unsigned const char *new) +{ + /* skip breakpoint */ + ip++; + new++; + if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) + return -EPERM; + return 0; +} + +static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + return add_update_code(ip, new); +} + +static int add_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + return add_update_code(ip, new); +} + +static int add_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_update_nop(rec); + } + + return 0; +} + +static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + + return 0; +} + +static int finish_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + return 0; +} + +static int finish_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_update_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return finish_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return finish_update_nop(rec); + } + + return 0; +} + +static void do_sync_core(void *data) +{ + sync_core(); +} + +static void run_sync(void) +{ + int enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disbled (on bootup). */ + if (enable_irqs) + local_irq_enable(); + on_each_cpu(do_sync_core, NULL, 1); + if (enable_irqs) + local_irq_disable(); +} + +void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + const char *report = "adding breakpoints"; + int count = 0; + int ret; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_breakpoints(rec, enable); + if (ret) + goto remove_breakpoints; + count++; + } + + run_sync(); + + report = "updating code"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + report = "removing breakpoints"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = finish_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + return; + + remove_breakpoints: + ftrace_bug(ret, rec ? rec->ip : 0); + printk(KERN_WARNING "Failed on %s (%d):\n", report, count); + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + remove_breakpoint(rec); + } +} + +void arch_ftrace_update_code(int command) +{ + modifying_ftrace_code++; + + ftrace_modify_all_code(command); + + modifying_ftrace_code--; +} + int __init ftrace_dyn_arch_init(void *data) { /* The return code is retured via data */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ce0be7cd085e..463c9797ca6a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -21,6 +21,7 @@ #include <asm/msr-index.h> #include <asm/cpufeature.h> #include <asm/percpu.h> +#include <asm/nops.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -363,28 +364,23 @@ default_entry: pushl $0 popfl -#ifdef CONFIG_SMP - cmpb $0, ready - jnz checkCPUtype -#endif /* CONFIG_SMP */ - /* * start system 32-bit setup. We need to re-do some of the things done * in 16-bit mode for the "real" operations. */ - call setup_idt - -checkCPUtype: - - movl $-1,X86_CPUID # -1 for no CPUID initially - + movl setup_once_ref,%eax + andl %eax,%eax + jz 1f # Did we do this already? + call *%eax +1: + /* check if it is 486 or 386. */ /* * XXX - this does a lot of unnecessary setup. Alignment checks don't * apply at our cpl of 0 and the stack ought to be aligned already, and * we don't need to preserve eflags. */ - + movl $-1,X86_CPUID # -1 for no CPUID initially movb $3,X86 # at least 386 pushfl # push EFLAGS popl %eax # get EFLAGS @@ -450,21 +446,6 @@ is386: movl $2,%ecx # set MP movl $(__KERNEL_PERCPU), %eax movl %eax,%fs # set this cpu's percpu -#ifdef CONFIG_CC_STACKPROTECTOR - /* - * The linker can't handle this by relocation. Manually set - * base address in stack canary segment descriptor. - */ - cmpb $0,ready - jne 1f - movl $gdt_page,%eax - movl $stack_canary,%ecx - movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) - shrl $16, %ecx - movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) - movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) -1: -#endif movl $(__KERNEL_STACK_CANARY),%eax movl %eax,%gs @@ -473,7 +454,6 @@ is386: movl $2,%ecx # set MP cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder - movb $1, ready jmp *(initial_code) /* @@ -495,81 +475,122 @@ check_x87: .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ ret + +#include "verify_cpu.S" + /* - * setup_idt + * setup_once * - * sets up a idt with 256 entries pointing to - * ignore_int, interrupt gates. It doesn't actually load - * idt - that can be done only after paging has been enabled - * and the kernel moved to PAGE_OFFSET. Interrupts - * are enabled elsewhere, when we can be relatively - * sure everything is ok. + * The setup work we only want to run on the BSP. * * Warning: %esi is live across this function. */ -setup_idt: - lea ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ +__INIT +setup_once: + /* + * Set up a idt with 256 entries pointing to ignore_int, + * interrupt gates. It doesn't actually load idt - that needs + * to be done on each CPU. Interrupts are enabled elsewhere, + * when we can be relatively sure everything is ok. + */ - lea idt_table,%edi - mov $256,%ecx -rp_sidt: + movl $idt_table,%edi + movl $early_idt_handlers,%eax + movl $NUM_EXCEPTION_VECTORS,%ecx +1: movl %eax,(%edi) - movl %edx,4(%edi) + movl %eax,4(%edi) + /* interrupt gate, dpl=0, present */ + movl $(0x8E000000 + __KERNEL_CS),2(%edi) + addl $9,%eax addl $8,%edi - dec %ecx - jne rp_sidt + loop 1b -.macro set_early_handler handler,trapno - lea \handler,%edx + movl $256 - NUM_EXCEPTION_VECTORS,%ecx + movl $ignore_int,%edx movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax + movw %dx,%ax /* selector = 0x0010 = cs */ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ - lea idt_table,%edi - movl %eax,8*\trapno(%edi) - movl %edx,8*\trapno+4(%edi) -.endm +2: + movl %eax,(%edi) + movl %edx,4(%edi) + addl $8,%edi + loop 2b - set_early_handler handler=early_divide_err,trapno=0 - set_early_handler handler=early_illegal_opcode,trapno=6 - set_early_handler handler=early_protection_fault,trapno=13 - set_early_handler handler=early_page_fault,trapno=14 +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * Configure the stack canary. The linker can't handle this by + * relocation. Manually set base address in stack canary + * segment descriptor. + */ + movl $gdt_page,%eax + movl $stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + andl $0,setup_once_ref /* Once is enough, thanks */ ret -early_divide_err: - xor %edx,%edx - pushl $0 /* fake errcode */ - jmp early_fault +ENTRY(early_idt_handlers) + # 36(%esp) %eflags + # 32(%esp) %cs + # 28(%esp) %eip + # 24(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if (EXCEPTION_ERRCODE_MASK >> i) & 1 + ASM_NOP2 + .else + pushl $0 # Dummy error code, to make stack frame uniform + .endif + pushl $i # 20(%esp) Vector number + jmp early_idt_handler + i = i + 1 + .endr +ENDPROC(early_idt_handlers) + + /* This is global to keep gas from relaxing the jumps */ +ENTRY(early_idt_handler) + cld + cmpl $2,%ss:early_recursion_flag + je hlt_loop + incl %ss:early_recursion_flag -early_illegal_opcode: - movl $6,%edx - pushl $0 /* fake errcode */ - jmp early_fault + push %eax # 16(%esp) + push %ecx # 12(%esp) + push %edx # 8(%esp) + push %ds # 4(%esp) + push %es # 0(%esp) + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es -early_protection_fault: - movl $13,%edx - jmp early_fault + cmpl $(__KERNEL_CS),32(%esp) + jne 10f -early_page_fault: - movl $14,%edx - jmp early_fault + leal 28(%esp),%eax # Pointer to %eip + call early_fixup_exception + andl %eax,%eax + jnz ex_entry /* found an exception entry */ -early_fault: - cld +10: #ifdef CONFIG_PRINTK - pusha - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - cmpl $2,early_recursion_flag - je hlt_loop - incl early_recursion_flag + xorl %eax,%eax + movw %ax,2(%esp) /* clean up the segment values on some cpus */ + movw %ax,6(%esp) + movw %ax,34(%esp) + leal 40(%esp),%eax + pushl %eax /* %esp before the exception */ + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi movl %cr2,%eax pushl %eax - pushl %edx /* trapno */ + pushl (20+6*4)(%esp) /* trapno */ pushl $fault_msg call printk #endif @@ -578,6 +599,17 @@ hlt_loop: hlt jmp hlt_loop +ex_entry: + pop %es + pop %ds + pop %edx + pop %ecx + pop %eax + addl $8,%esp /* drop vector number and error code */ + decl %ss:early_recursion_flag + iret +ENDPROC(early_idt_handler) + /* This is the default interrupt "handler" :-) */ ALIGN ignore_int: @@ -611,13 +643,18 @@ ignore_int: popl %eax #endif iret +ENDPROC(ignore_int) +__INITDATA + .align 4 +early_recursion_flag: + .long 0 -#include "verify_cpu.S" - - __REFDATA -.align 4 +__REFDATA + .align 4 ENTRY(initial_code) .long i386_start_kernel +ENTRY(setup_once_ref) + .long setup_once /* * BSS section @@ -670,22 +707,19 @@ ENTRY(initial_page_table) ENTRY(stack_start) .long init_thread_union+THREAD_SIZE -early_recursion_flag: - .long 0 - -ready: .byte 0 - +__INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" fault_msg: /* fault info: */ .ascii "BUG: Int %d: CR2 %p\n" -/* pusha regs: */ - .ascii " EDI %p ESI %p EBP %p ESP %p\n" - .ascii " EBX %p EDX %p ECX %p EAX %p\n" +/* regs pushed in early_idt_handler: */ + .ascii " EDI %p ESI %p EBP %p EBX %p\n" + .ascii " ESP %p ES %p DS %p\n" + .ascii " EDX %p ECX %p EAX %p\n" /* fault frame: */ - .ascii " err %p EIP %p CS %p flg %p\n" + .ascii " vec %p err %p EIP %p CS %p flg %p\n" .ascii "Stack: %p %p %p %p %p %p %p %p\n" .ascii " %p %p %p %p %p %p %p %p\n" .asciz " %p %p %p %p %p %p %p %p\n" @@ -699,6 +733,7 @@ fault_msg: * segment size, and 32-bit linear address value: */ + .data .globl boot_gdt_descr .globl idt_descr diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 40f4eb3766d1..7a40f2447321 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -19,12 +19,15 @@ #include <asm/cache.h> #include <asm/processor-flags.h> #include <asm/percpu.h> +#include <asm/nops.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> #include <asm/paravirt.h> +#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg #else -#define GET_CR2_INTO_RCX movq %cr2, %rcx +#define GET_CR2_INTO(reg) movq %cr2, reg +#define INTERRUPT_RETURN iretq #endif /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -270,36 +273,56 @@ bad_address: jmp bad_address .section ".init.text","ax" -#ifdef CONFIG_EARLY_PRINTK .globl early_idt_handlers early_idt_handlers: + # 104(%rsp) %rflags + # 96(%rsp) %cs + # 88(%rsp) %rip + # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - movl $i, %esi + .if (EXCEPTION_ERRCODE_MASK >> i) & 1 + ASM_NOP2 + .else + pushq $0 # Dummy error code, to make stack frame uniform + .endif + pushq $i # 72(%rsp) Vector number jmp early_idt_handler i = i + 1 .endr -#endif ENTRY(early_idt_handler) -#ifdef CONFIG_EARLY_PRINTK + cld + cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) - GET_CR2_INTO_RCX - movq %rcx,%r9 - xorl %r8d,%r8d # zero for error code - movl %esi,%ecx # get vector number - # Test %ecx against mask of vectors that push error code. - cmpl $31,%ecx - ja 0f - movl $1,%eax - salq %cl,%rax - testl $0x27d00,%eax - je 0f - popq %r8 # get error code -0: movq 0(%rsp),%rcx # get ip - movq 8(%rsp),%rdx # get cs + + pushq %rax # 64(%rsp) + pushq %rcx # 56(%rsp) + pushq %rdx # 48(%rsp) + pushq %rsi # 40(%rsp) + pushq %rdi # 32(%rsp) + pushq %r8 # 24(%rsp) + pushq %r9 # 16(%rsp) + pushq %r10 # 8(%rsp) + pushq %r11 # 0(%rsp) + + cmpl $__KERNEL_CS,96(%rsp) + jne 10f + + leaq 88(%rsp),%rdi # Pointer to %rip + call early_fixup_exception + andl %eax,%eax + jnz 20f # Found an exception entry + +10: +#ifdef CONFIG_EARLY_PRINTK + GET_CR2_INTO(%r9) # can clobber any volatile register if pv + movl 80(%rsp),%r8d # error code + movl 72(%rsp),%esi # vector number + movl 96(%rsp),%edx # %cs + movq 88(%rsp),%rcx # %rip xorl %eax,%eax leaq early_idt_msg(%rip),%rdi call early_printk @@ -308,17 +331,32 @@ ENTRY(early_idt_handler) call dump_stack #ifdef CONFIG_KALLSYMS leaq early_idt_ripmsg(%rip),%rdi - movq 0(%rsp),%rsi # get rip again + movq 40(%rsp),%rsi # %rip again call __print_symbol #endif #endif /* EARLY_PRINTK */ 1: hlt jmp 1b -#ifdef CONFIG_EARLY_PRINTK +20: # Exception table entry found + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %rax + addq $16,%rsp # drop vector number and error code + decl early_recursion_flag(%rip) + INTERRUPT_RETURN + + .balign 4 early_recursion_flag: .long 0 +#ifdef CONFIG_EARLY_PRINTK early_idt_msg: .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" early_idt_ripmsg: diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 7734bcbb5a3a..f250431fb505 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -88,7 +88,7 @@ void kernel_fpu_begin(void) __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ } else { - percpu_write(fpu_owner_task, NULL); + this_cpu_write(fpu_owner_task, NULL); clts(); } } @@ -235,6 +235,7 @@ int init_fpu(struct task_struct *tsk) if (tsk_used_math(tsk)) { if (HAVE_HWFP && tsk == current) unlazy_fpu(tsk); + tsk->thread.fpu.last_cpu = ~0; return 0; } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 610485223bdb..36d1853e91af 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -15,7 +15,6 @@ #include <linux/delay.h> #include <linux/atomic.h> -#include <asm/system.h> #include <asm/timer.h> #include <asm/hw_irq.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c deleted file mode 100644 index 43e9ccf44947..000000000000 --- a/arch/x86/kernel/init_task.c +++ /dev/null @@ -1,42 +0,0 @@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include <linux/mqueue.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - -/* - * Initial thread structure. - * - * We need to make sure that this is THREAD_SIZE aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); -EXPORT_SYMBOL(init_task); - -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data..cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7943e0c21bde..3dafc6003b7c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -282,8 +282,13 @@ void fixup_irqs(void) else if (!(warned++)) set_affinity = 0; + /* + * We unmask if the irq was not marked masked by the + * core code. That respects the lazy irq disable + * behaviour. + */ if (!irqd_can_move_in_process_context(data) && - !irqd_irq_disabled(data) && chip->irq_unmask) + !irqd_irq_masked(data) && chip->irq_unmask) chip->irq_unmask(data); raw_spin_unlock(&desc->lock); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 58b7f27cb3e9..344faf8d0d62 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu) return; irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), - THREAD_FLAGS, - THREAD_ORDER)); + THREADINFO_GFP, + THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; @@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), - THREAD_FLAGS, - THREAD_ORDER)); + THREADINFO_GFP, + THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 43e2b1cff0a7..252981afd6c4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -16,7 +16,6 @@ #include <linux/delay.h> #include <linux/atomic.h> -#include <asm/system.h> #include <asm/timer.h> #include <asm/hw_irq.h> #include <asm/pgtable.h> @@ -61,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) outb(0, 0xF0); if (ignore_fpu_irq || !boot_cpu_data.hard_math) return IRQ_NONE; - math_error(get_irq_regs(), 0, 16); + math_error(get_irq_regs(), 0, X86_TRAP_MF); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 90fcf62854bb..1d5d31ea686b 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -68,16 +68,9 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, return count; } -static int setup_data_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - - return 0; -} - static const struct file_operations fops_setup_data = { .read = setup_data_read, - .open = setup_data_open, + .open = simple_open, .llseek = default_llseek, }; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index fdc37b3d0ce3..8bfb6146f753 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,10 +43,11 @@ #include <linux/smp.h> #include <linux/nmi.h> #include <linux/hw_breakpoint.h> +#include <linux/uaccess.h> +#include <linux/memory.h> #include <asm/debugreg.h> #include <asm/apicdef.h> -#include <asm/system.h> #include <asm/apic.h> #include <asm/nmi.h> @@ -742,6 +743,64 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + char opc[BREAK_INSTR_SIZE]; + + bpt->type = BP_BREAKPOINT; + err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); + if (err) + return err; + err = probe_kernel_write((char *)bpt->bpt_addr, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); +#ifdef CONFIG_DEBUG_RODATA + if (!err) + return err; + /* + * It is safe to call text_poke() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. + */ + if (mutex_is_locked(&text_mutex)) + return -EBUSY; + text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); + err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); + if (err) + return err; + if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) + return -EINVAL; + bpt->type = BP_POKE_BREAKPOINT; +#endif /* CONFIG_DEBUG_RODATA */ + return err; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ +#ifdef CONFIG_DEBUG_RODATA + int err; + char opc[BREAK_INSTR_SIZE]; + + if (bpt->type != BP_POKE_BREAKPOINT) + goto knl_write; + /* + * It is safe to call text_poke() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. + */ + if (mutex_is_locked(&text_mutex)) + goto knl_write; + text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); + err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); + if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) + goto knl_write; + return err; +knl_write: +#endif /* CONFIG_DEBUG_RODATA */ + return probe_kernel_write((char *)bpt->bpt_addr, + (char *)bpt->saved_instr, BREAK_INSTR_SIZE); +} + struct kgdb_arch arch_kgdb_ops = { /* Breakpoint instruction: */ .gdb_bpt_instr = { 0xcc }, diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e213fc8408d2..e2f751efb7b1 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) "current sp %p does not match saved sp %p\n", stack_addr(regs), kcb->jprobe_saved_sp); printk(KERN_ERR "Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); + show_regs(saved_regs); printk(KERN_ERR "Current registers\n"); - show_registers(regs); + show_regs(regs); BUG(); } *regs = kcb->jprobe_saved_regs; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 694d801bf606..e554e5ad2fe8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -38,6 +38,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/tlbflush.h> +#include <asm/idle.h> static int kvmapf = 1; @@ -78,7 +79,6 @@ struct kvm_task_sleep_node { u32 token; int cpu; bool halted; - struct mm_struct *mm; }; static struct kvm_task_sleep_head { @@ -125,9 +125,7 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.mm = current->active_mm; n.halted = idle || preempt_count() > 1; - atomic_inc(&n.mm->mm_count); init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); @@ -160,9 +158,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (!n->mm) - return; - mmdrop(n->mm); if (n->halted) smp_send_reschedule(n->cpu); else if (waitqueue_active(&n->wq)) @@ -206,7 +201,7 @@ again: * async PF was not yet handled. * Add dummy entry for the token. */ - n = kmalloc(sizeof(*n), GFP_ATOMIC); + n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { /* * Allocation failed! Busy wait while other cpu @@ -218,7 +213,6 @@ again: } n->token = token; n->cpu = smp_processor_id(); - n->mm = NULL; init_waitqueue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else @@ -253,7 +247,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) kvm_async_pf_task_wait((u32)read_cr2()); break; case KVM_PV_REASON_PAGE_READY: + rcu_irq_enter(); + exit_idle(); kvm_async_pf_task_wake((u32)read_cr2()); + rcu_irq_exit(); break; } } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44842d756b29..f8492da65bfc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt) return ret; } +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + #ifdef CONFIG_X86_LOCAL_APIC static void __cpuinit kvm_setup_secondary_clock(void) { @@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) * we shouldn't fail. */ WARN_ON(kvm_register_clock("secondary cpu clock")); - /* ok, done with our trickery, call native */ - setup_secondary_APIC_clock(); } #endif @@ -194,9 +201,11 @@ void __init kvmclock_init(void) x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ea697263b373..ebc987398923 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -15,7 +15,6 @@ #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/system.h> #include <asm/ldt.h> #include <asm/desc.h> #include <asm/mmu_context.h> diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index a3fa43ba5d3b..5b19e4d78b00 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,6 @@ #include <asm/apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> -#include <asm/system.h> #include <asm/cacheflush.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 177183cbb6ae..7eb1e2b97827 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -43,7 +43,6 @@ #include <linux/mca.h> #include <linux/kprobes.h> #include <linux/slab.h> -#include <asm/system.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/mman.h> diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 73465aab28f8..8a2ce8fd41c0 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -82,11 +82,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("CPU%d: family %d not supported\n", cpu, c->x86); - return -1; - } - csig->rev = c->microcode; pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); @@ -380,6 +375,13 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + return NULL; + } + patch = (void *)get_zeroed_page(GFP_KERNEL); if (!patch) return NULL; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 87a0f8688301..fbdfc6917180 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev, { unsigned long val; int cpu = dev->id; - int ret = 0; - char *end; + ssize_t ret = 0; - val = simple_strtoul(buf, &end, 0); - if (end == buf) - return -EINVAL; + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (val == 1) { get_online_cpus(); @@ -419,10 +418,8 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif) if (err) return err; - if (microcode_init_cpu(cpu) == UCODE_ERROR) { - sysfs_remove_group(&dev->kobj, &mc_attr_group); + if (microcode_init_cpu(cpu) == UCODE_ERROR) return -EINVAL; - } return err; } @@ -528,11 +525,11 @@ static int __init microcode_init(void) microcode_ops = init_intel_microcode(); else if (c->x86_vendor == X86_VENDOR_AMD) microcode_ops = init_amd_microcode(); - - if (!microcode_ops) { + else pr_err("no support for this CPU vendor\n"); + + if (!microcode_ops) return -ENODEV; - } microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 3ca42d0e43a2..0327e2b3c408 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -147,12 +147,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - pr_err("CPU%d not a capable Intel processor\n", cpu_num); - return -1; - } - csig->sig = cpuid_eax(0x00000001); if ((c->x86_model >= 5) || (c->x86 > 6)) { @@ -463,6 +457,14 @@ static struct microcode_ops microcode_intel_ops = { struct microcode_ops * __init init_intel_microcode(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + return µcode_intel_ops; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 925179f871de..f21fd94ac897 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -26,7 +26,6 @@ #include <linux/gfp.h> #include <linux/jump_label.h> -#include <asm/system.h> #include <asm/page.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 96356762a51d..eb113693f043 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,6 @@ #include <asm/processor.h> #include <asm/msr.h> -#include <asm/system.h> static struct class *msr_class; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf319165..bffdfd48c1f2 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -31,14 +31,6 @@ #include <asm/nmi.h> #include <asm/x86_init.h> -#define NMI_MAX_NAMELEN 16 -struct nmiaction { - struct list_head list; - nmi_handler_t handler; - unsigned int flags; - char *name; -}; - struct nmi_desc { spinlock_t lock; struct list_head head; @@ -54,6 +46,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] = .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .head = LIST_HEAD_INIT(nmi_desc[2].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .head = LIST_HEAD_INIT(nmi_desc[3].head), + }, }; @@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) -static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -107,11 +107,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, return handled; } -static int __setup_nmi(unsigned int type, struct nmiaction *action) +int __register_nmi_handler(unsigned int type, struct nmiaction *action) { struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; + if (!action->handler) + return -EINVAL; + spin_lock_irqsave(&desc->lock, flags); /* @@ -120,6 +123,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) * to manage expectations */ WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); /* * some handlers need to be executed first otherwise a fake @@ -133,8 +138,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) spin_unlock_irqrestore(&desc->lock, flags); return 0; } +EXPORT_SYMBOL(__register_nmi_handler); -static struct nmiaction *__free_nmi(unsigned int type, const char *name) +void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *n; @@ -157,61 +163,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name) spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); - return (n); } - -int register_nmi_handler(unsigned int type, nmi_handler_t handler, - unsigned long nmiflags, const char *devname) -{ - struct nmiaction *action; - int retval = -ENOMEM; - - if (!handler) - return -EINVAL; - - action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL); - if (!action) - goto fail_action; - - action->handler = handler; - action->flags = nmiflags; - action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL); - if (!action->name) - goto fail_action_name; - - retval = __setup_nmi(type, action); - - if (retval) - goto fail_setup_nmi; - - return retval; - -fail_setup_nmi: - kfree(action->name); -fail_action_name: - kfree(action); -fail_action: - - return retval; -} -EXPORT_SYMBOL_GPL(register_nmi_handler); - -void unregister_nmi_handler(unsigned int type, const char *name) -{ - struct nmiaction *a; - - a = __free_nmi(type, name); - if (a) { - kfree(a->name); - kfree(a); - } -} - EXPORT_SYMBOL_GPL(unregister_nmi_handler); -static notrace __kprobes void +static __kprobes void pci_serr_error(unsigned char reason, struct pt_regs *regs) { + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_SERR, regs, false)) + return; + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -236,15 +197,19 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_IO_CHECK, regs, false)) + return; + pr_emerg( "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - show_registers(regs); + show_regs(regs); if (panic_on_io_nmi) panic("NMI IOCK error: Not continuing"); @@ -263,7 +228,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { int handled; @@ -305,7 +270,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +static __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 2c39dcd510fa..e31bf8d5c4d2 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -13,6 +13,7 @@ #include <linux/cpumask.h> #include <linux/delay.h> #include <linux/init.h> +#include <linux/percpu.h> #include <asm/apic.h> #include <asm/nmi.h> @@ -117,15 +118,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected) unexpected_testcase_failures++; if (nmi_fail == FAILURE) - printk("FAILED |"); + printk(KERN_CONT "FAILED |"); else if (nmi_fail == TIMEOUT) - printk("TIMEOUT|"); + printk(KERN_CONT "TIMEOUT|"); else - printk("ERROR |"); + printk(KERN_CONT "ERROR |"); dump_stack(); } else { testcase_successes++; - printk(" ok |"); + printk(KERN_CONT " ok |"); } testcase_total++; @@ -150,10 +151,10 @@ void __init nmi_selftest(void) print_testname("remote IPI"); dotest(remote_ipi, SUCCESS); - printk("\n"); + printk(KERN_CONT "\n"); print_testname("local IPI"); dotest(local_ipi, SUCCESS); - printk("\n"); + printk(KERN_CONT "\n"); cleanup_nmi_testsuite(); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9c57c02e54f6..9ce885996fd7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -38,6 +38,7 @@ #include <asm/apic.h> #include <asm/tlbflush.h> #include <asm/timer.h> +#include <asm/special_insns.h> /* nop stub */ void _paravirt_nop(void) @@ -240,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - percpu_write(paravirt_lazy_mode, mode); + this_cpu_write(paravirt_lazy_mode, mode); } static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(percpu_read(paravirt_lazy_mode) != mode); + BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); - percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); + this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -266,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev) { BUG_ON(preemptible()); - if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } @@ -288,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) if (in_interrupt()) return PARAVIRT_LAZY_NONE; - return percpu_read(paravirt_lazy_mode); + return this_cpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 726494b58345..b72838bae64a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -42,7 +42,6 @@ #include <asm/calgary.h> #include <asm/tce.h> #include <asm/pci-direct.h> -#include <asm/system.h> #include <asm/dma.h> #include <asm/rio.h> #include <asm/bios_ebda.h> @@ -431,7 +430,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, } static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) + dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) { void *ret = NULL; dma_addr_t mapping; @@ -464,7 +463,8 @@ error: } static void calgary_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) { unsigned int npages; struct iommu_table *tbl = find_iommu_table(dev); @@ -477,8 +477,8 @@ static void calgary_free_coherent(struct device *dev, size_t size, } static struct dma_map_ops calgary_dma_ops = { - .alloc_coherent = calgary_alloc_coherent, - .free_coherent = calgary_free_coherent, + .alloc = calgary_alloc_coherent, + .free = calgary_free_coherent, .map_sg = calgary_map_sg, .unmap_sg = calgary_unmap_sg, .map_page = calgary_map_page, @@ -1480,8 +1480,9 @@ cleanup: static int __init calgary_parse_options(char *p) { unsigned int bridge; + unsigned long val; size_t len; - char* endp; + ssize_t ret; while (*p) { if (!strncmp(p, "64k", 3)) @@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p) ++p; if (*p == '\0') break; - bridge = simple_strtoul(p, &endp, 0); - if (p == endp) + ret = kstrtoul(p, 0, &val); + if (ret) break; + bridge = val; if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB %#x\n", bridge); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 28e5e06fcba4..3003250ac51d 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -96,7 +96,8 @@ void __init pci_iommu_alloc(void) } } void *dma_generic_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag) + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs) { unsigned long dma_mask; struct page *page; diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 3af4af810c07..f96050685b46 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -75,7 +75,7 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, } static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) + dma_addr_t dma_addr, struct dma_attrs *attrs) { free_pages((unsigned long)vaddr, get_order(size)); } @@ -96,8 +96,8 @@ static void nommu_sync_sg_for_device(struct device *dev, } struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, + .alloc = dma_generic_alloc_coherent, + .free = nommu_free_coherent, .map_sg = nommu_map_sg, .map_page = nommu_map_page, .sync_single_for_device = nommu_sync_single_for_device, diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 8f972cbddef0..6c483ba98b9c 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -15,21 +15,30 @@ int swiotlb __read_mostly; static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *vaddr; - vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags); + vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags, + attrs); if (vaddr) return vaddr; return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); } +static void x86_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs) +{ + swiotlb_free_coherent(dev, size, vaddr, dma_addr); +} + static struct dma_map_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, - .alloc_coherent = x86_swiotlb_alloc_coherent, - .free_coherent = swiotlb_free_coherent, + .alloc = x86_swiotlb_alloc_coherent, + .free = x86_swiotlb_free_coherent, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 14baf78d5a1f..735279e54e59 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -12,10 +12,12 @@ #include <linux/user-return-notifier.h> #include <linux/dmi.h> #include <linux/utsname.h> +#include <linux/stackprotector.h> +#include <linux/tick.h> +#include <linux/cpuidle.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <asm/cpu.h> -#include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> #include <asm/idle.h> @@ -23,14 +25,47 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/debugreg.h> +#include <asm/nmi.h> + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data..cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU(unsigned char, is_idle); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); +#endif struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); +/* + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. + */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { int ret; + unlazy_fpu(src); + *dst = *src; if (fpu_allocated(&src->thread.fpu)) { memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); @@ -47,10 +82,9 @@ void free_thread_xstate(struct task_struct *tsk) fpu_free(&tsk->thread.fpu); } -void free_thread_info(struct thread_info *ti) +void arch_release_task_struct(struct task_struct *tsk) { - free_thread_xstate(ti->task); - free_pages((unsigned long)ti, THREAD_ORDER); + free_thread_xstate(tsk); } void arch_task_cache_init(void) @@ -61,6 +95,16 @@ void arch_task_cache_init(void) SLAB_PANIC | SLAB_NOTRACK, NULL); } +static inline void drop_fpu(struct task_struct *tsk) +{ + /* + * Forget coprocessor state.. + */ + tsk->fpu_counter = 0; + clear_fpu(tsk); + clear_used_math(); +} + /* * Free current thread data structures etc.. */ @@ -83,12 +127,8 @@ void exit_thread(void) put_cpu(); kfree(bp); } -} -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); + drop_fpu(me); } void show_regs_common(void) @@ -123,12 +163,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); + drop_fpu(tsk); } static void hard_disable_TSC(void) @@ -342,36 +377,105 @@ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); #endif -#ifdef CONFIG_X86_32 -/* - * This halt magic was a workaround for ancient floppy DMA - * wreckage. It should be safe to remove. - */ -static int hlt_counter; -void disable_hlt(void) +static inline int hlt_use_halt(void) { - hlt_counter++; + return 1; } -EXPORT_SYMBOL(disable_hlt); -void enable_hlt(void) +#ifndef CONFIG_SMP +static inline void play_dead(void) { - hlt_counter--; + BUG(); } -EXPORT_SYMBOL(enable_hlt); +#endif -static inline int hlt_use_halt(void) +#ifdef CONFIG_X86_64 +void enter_idle(void) { - return (!hlt_counter && boot_cpu_data.hlt_works_ok); + this_cpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } -#else -static inline int hlt_use_halt(void) + +static void __exit_idle(void) { - return 1; + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); } #endif /* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + current_thread_info()->status |= TS_POLLING; + + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + rmb(); + + if (cpu_is_offline(smp_processor_id())) + play_dead(); + + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_touch_nmi(); + local_irq_disable(); + + enter_idle(); + + /* Don't trace irqs off for idle */ + stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + + if (cpuidle_idle_call()) + pm_idle(); + + rcu_idle_exit(); + start_critical_timings(); + + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. */ + __exit_idle(); + } + + tick_nohz_idle_exit(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +/* * We use this if we don't have any better * idle routine.. */ @@ -427,26 +531,6 @@ void stop_this_cpu(void *dummy) } } -static void do_nothing(void *unused) -{ -} - -/* - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of - * pm_idle and update to new pm_idle value. Required while changing pm_idle - * handler on SMP systems. - * - * Caller must have changed pm_idle to the new value before the call. Old - * pm_idle value will not be used by any CPU after the return of this function. - */ -void cpu_idle_wait(void) -{ - smp_mb(); - /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 1); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { @@ -505,9 +589,17 @@ int mwait_usable(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + /* Use mwait if idle=mwait boot option is given */ if (boot_option_idle_override == IDLE_FORCE_MWAIT) return 1; + /* + * Any idle= boot option other than idle=mwait means that we must not + * use mwait. Eg: idle=halt or idle=poll or idle=nomwait + */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return 0; + if (c->cpuid_level < MWAIT_INFO) return 0; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9d7d4842bfaf..516fa186121b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,7 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> @@ -31,17 +30,14 @@ #include <linux/kallsyms.h> #include <linux/ptrace.h> #include <linux/personality.h> -#include <linux/tick.h> #include <linux/percpu.h> #include <linux/prctl.h> #include <linux/ftrace.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> -#include <linux/cpuidle.h> #include <asm/pgtable.h> -#include <asm/system.h> #include <asm/ldt.h> #include <asm/processor.h> #include <asm/i387.h> @@ -58,7 +54,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> -#include <asm/nmi.h> +#include <asm/switch_to.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -70,60 +66,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifndef CONFIG_SMP -static inline void play_dead(void) -{ - BUG(); -} -#endif - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - int cpu = smp_processor_id(); - - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); - - current_thread_info()->status |= TS_POLLING; - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); - while (!need_resched()) { - - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(cpu)) - play_dead(); - - local_touch_nmi(); - local_irq_disable(); - /* Don't trace irqs off for idle */ - stop_critical_timings(); - if (cpuidle_idle_call()) - pm_idle(); - start_critical_timings(); - } - rcu_idle_exit(); - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -184,15 +126,6 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) @@ -360,7 +293,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_fpu_finish(next_p, fpu); - percpu_write(current_task, next_p); + this_cpu_write(current_task, next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 292da13fc5aa..61cdf7fdf099 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,7 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> @@ -32,15 +31,12 @@ #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/kdebug.h> -#include <linux/tick.h> #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> -#include <linux/cpuidle.h> #include <asm/pgtable.h> -#include <asm/system.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/fpu-internal.h> @@ -52,114 +48,11 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> -#include <asm/nmi.h> +#include <asm/switch_to.h> asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); -static DEFINE_PER_CPU(unsigned char, is_idle); - -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); - -void enter_idle(void) -{ - percpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} - -#ifndef CONFIG_SMP -static inline void play_dead(void) -{ - BUG(); -} -#endif - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - current_thread_info()->status |= TS_POLLING; - - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - while (!need_resched()) { - - rmb(); - - if (cpu_is_offline(smp_processor_id())) - play_dead(); - /* - * Idle routines should keep interrupts disabled - * from here on, until they go to idle. - * Otherwise, idle callbacks can misfire. - */ - local_touch_nmi(); - local_irq_disable(); - enter_idle(); - /* Don't trace irqs off for idle */ - stop_critical_timings(); - - /* enter_idle() needs rcu for notifiers */ - rcu_idle_enter(); - - if (cpuidle_idle_call()) - pm_idle(); - - rcu_idle_exit(); - start_critical_timings(); - - /* In many cases the interrupt that ended idle - has already called exit_idle. But some idle - loops can be woken up without interrupt. */ - __exit_idle(); - } - - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -252,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(&t->thread.tls_array[tls]); } -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) @@ -344,7 +228,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - percpu_write(old_rsp, new_sp); + this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; @@ -365,7 +249,9 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) { start_thread_common(regs, new_ip, new_sp, - __USER32_CS, __USER32_DS, __USER32_DS); + test_thread_flag(TIF_X32) + ? __USER_CS : __USER32_CS, + __USER_DS, __USER_DS); } #endif @@ -464,11 +350,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->usersp = percpu_read(old_rsp); - percpu_write(old_rsp, next->usersp); - percpu_write(current_task, next_p); + prev->usersp = this_cpu_read(old_rsp); + this_cpu_write(old_rsp, next->usersp); + this_cpu_write(current_task, next_p); - percpu_write(kernel_stack, + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); @@ -488,6 +374,8 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_ADDR32); + clear_thread_flag(TIF_X32); /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -500,21 +388,33 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -void set_personality_ia32(void) +void set_personality_ia32(bool x32) { /* inherit personality from parent */ /* Make sure to be in 32bit mode */ - set_thread_flag(TIF_IA32); - current->personality |= force_personality32; + set_thread_flag(TIF_ADDR32); /* Mark the associated mm as containing 32-bit tasks. */ if (current->mm) current->mm->context.ia32_compat = 1; - /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + if (x32) { + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + current->personality &= ~READ_IMPLIES_EXEC; + /* is_compat_task() uses the presence of the x32 + syscall bit flag to determine compat status */ + current_thread_info()->status &= ~TS_COMPAT; + } else { + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; + } } +EXPORT_SYMBOL_GPL(set_personality_ia32); unsigned long get_wchan(struct task_struct *p) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 78f05e438be5..13b1990c7c58 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,7 +24,6 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> -#include <asm/system.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/fpu-internal.h> @@ -34,6 +33,7 @@ #include <asm/prctl.h> #include <asm/proto.h> #include <asm/hw_breakpoint.h> +#include <asm/traps.h> #include "tls.h" @@ -1131,6 +1131,100 @@ static int genregs32_set(struct task_struct *target, return ret; } +#ifdef CONFIG_X86_X32_ABI +static long x32_arch_ptrace(struct task_struct *child, + compat_long_t request, compat_ulong_t caddr, + compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + + switch (request) { + /* Read 32bits at location addr in the USER area. Only allow + to return the lower 32bits of segment and debug registers. */ + case PTRACE_PEEKUSR: { + u32 tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, (__u32 __user *)datap); + break; + } + + /* Write the word at location addr in the USER area. Only allow + to update segment and debug registers with the upper 32bits + zero-extended. */ + case PTRACE_POKEUSR: + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + return do_arch_prctl(child, data, addr); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif + long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { @@ -1140,6 +1234,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, int ret; __u32 val; +#ifdef CONFIG_X86_X32_ABI + if (!is_ia32_task()) + return x32_arch_ptrace(child, request, caddr, cdata); +#endif + switch (request) { case PTRACE_PEEKUSR: ret = getreg32(child, addr, &val); @@ -1327,7 +1426,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, int error_code, int si_code, struct siginfo *info) { - tsk->thread.trap_no = 1; + tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; memset(info, 0, sizeof(*info)); @@ -1381,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs) regs->flags |= X86_EFLAGS_TF; /* do the secure computing check first */ - secure_computing(regs->orig_ax); + if (secure_computing(regs->orig_ax)) { + /* seccomp failures shouldn't expose any additional code. */ + ret = -1L; + goto out; + } if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) ret = -1L; @@ -1406,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs) regs->dx, regs->r10); #endif +out: return ret ?: regs->orig_ax; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d840e69a853c..77215c23fba1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -39,7 +39,8 @@ static int reboot_mode; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; -/* This variable is used privately to keep track of whether or not +/* + * This variable is used privately to keep track of whether or not * reboot_type is still set to its default value (i.e., reboot= hasn't * been set on the command line). This is needed so that we can * suppress DMI scanning for reboot quirks. Without it, it's @@ -51,7 +52,8 @@ static int reboot_default = 1; static int reboot_cpu = -1; #endif -/* This is set if we need to go through the 'emergency' path. +/* + * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on * an inconsistent state and won't be able to do a clean cleanup */ @@ -60,22 +62,24 @@ static int reboot_emergency; /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ bool port_cf9_safe = false; -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] - warm Don't set the cold reboot flag - cold Set the cold reboot flag - bios Reboot by jumping through the BIOS (only for X86_32) - smp Reboot by executing reset on BSP or other CPU (only for X86_32) - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - acpi Use the RESET_REG in the FADT - efi Use efi reset_system runtime service - pci Use the so-called "PCI reset register", CF9 - force Avoid anything that could hang. +/* + * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] + * warm Don't set the cold reboot flag + * cold Set the cold reboot flag + * bios Reboot by jumping through the BIOS (only for X86_32) + * smp Reboot by executing reset on BSP or other CPU (only for X86_32) + * triple Force a triple fault (init) + * kbd Use the keyboard controller. cold reset (default) + * acpi Use the RESET_REG in the FADT + * efi Use efi reset_system runtime service + * pci Use the so-called "PCI reset register", CF9 + * force Avoid anything that could hang. */ static int __init reboot_setup(char *str) { for (;;) { - /* Having anything passed on the command line via + /* + * Having anything passed on the command line via * reboot= will cause us to disable DMI checking * below. */ @@ -98,9 +102,11 @@ static int __init reboot_setup(char *str) if (isdigit(*(str+2))) reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); } - /* we will leave sorting out the final value - when we are ready to reboot, since we might not - have detected BSP APIC ID or smp_num_cpu */ + /* + * We will leave sorting out the final value + * when we are ready to reboot, since we might not + * have detected BSP APIC ID or smp_num_cpu + */ break; #endif /* CONFIG_SMP */ @@ -150,6 +156,82 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +extern const unsigned char machine_real_restart_asm[]; +extern const u64 machine_real_restart_gdt[3]; + +void machine_real_restart(unsigned int type) +{ + void *restart_va; + unsigned long restart_pa; + void (*restart_lowmem)(unsigned int); + u64 *lowmem_gdt; + + local_irq_disable(); + + /* + * Write zero to CMOS register number 0x0f, which the BIOS POST + * routine will recognize as telling it to do a proper reboot. (Well + * that's what this book in front of me says -- it may only apply to + * the Phoenix BIOS though, it's not clear). At the same time, + * disable NMIs by setting the top bit in the CMOS address register, + * as we're about to do peculiar things to the CPU. I'm not sure if + * `outb_p' is needed instead of just `outb'. Use it to be on the + * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) + */ + spin_lock(&rtc_lock); + CMOS_WRITE(0x00, 0x8f); + spin_unlock(&rtc_lock); + + /* + * Switch back to the initial page table. + */ + load_cr3(initial_page_table); + + /* + * Write 0x1234 to absolute memory location 0x472. The BIOS reads + * this on booting to tell it to "Bypass memory test (also warm + * boot)". This seems like a fairly standard thing that gets set by + * REBOOT.COM programs, and the previous reset routine did this + * too. */ + *((unsigned short *)0x472) = reboot_mode; + + /* Patch the GDT in the low memory trampoline */ + lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); + + restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); + restart_pa = virt_to_phys(restart_va); + restart_lowmem = (void (*)(unsigned int))restart_pa; + + /* GDT[0]: GDT self-pointer */ + lowmem_gdt[0] = + (u64)(sizeof(machine_real_restart_gdt) - 1) + + ((u64)virt_to_phys(lowmem_gdt) << 16); + /* GDT[1]: 64K real mode code segment */ + lowmem_gdt[1] = + GDT_ENTRY(0x009b, restart_pa, 0xffff); + + /* Jump to the identity-mapped low memory code */ + restart_lowmem(type); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif + +#endif /* CONFIG_X86_32 */ + +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9) { + reboot_type = BOOT_CF9; + printk(KERN_INFO "%s series board detected. " + "Selecting PCI-method for reboots.\n", d->ident); + } + return 0; +} + static int __init set_kbd_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_KBD) { @@ -159,7 +241,12 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) return 0; } +/* + * This is a single dmi_table handling all reboot quirks. Note that + * REBOOT_BIOS is only available for 32bit + */ static struct dmi_system_id __initdata reboot_dmi_table[] = { +#ifdef CONFIG_X86_32 { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, .ident = "Dell E520", @@ -184,7 +271,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ + { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -192,7 +279,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/ + { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -201,7 +288,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0MM599"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ + { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -210,7 +297,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KW626"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 330", .matches = { @@ -219,7 +306,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 360", .matches = { @@ -228,7 +315,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, - { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 760", .matches = { @@ -301,7 +388,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, - { /* Handle problems with rebooting on ASUS P4S800 */ + { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, .ident = "ASUS P4S800", .matches = { @@ -309,7 +396,9 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, - { /* Handle reboot issue on Acer Aspire one */ +#endif /* CONFIG_X86_32 */ + + { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, .ident = "Acer Aspire One A110", .matches = { @@ -317,96 +406,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, - { } -}; - -static int __init reboot_init(void) -{ - /* Only do the DMI check if reboot_type hasn't been overridden - * on the command line - */ - if (reboot_default) { - dmi_check_system(reboot_dmi_table); - } - return 0; -} -core_initcall(reboot_init); - -extern const unsigned char machine_real_restart_asm[]; -extern const u64 machine_real_restart_gdt[3]; - -void machine_real_restart(unsigned int type) -{ - void *restart_va; - unsigned long restart_pa; - void (*restart_lowmem)(unsigned int); - u64 *lowmem_gdt; - - local_irq_disable(); - - /* Write zero to CMOS register number 0x0f, which the BIOS POST - routine will recognize as telling it to do a proper reboot. (Well - that's what this book in front of me says -- it may only apply to - the Phoenix BIOS though, it's not clear). At the same time, - disable NMIs by setting the top bit in the CMOS address register, - as we're about to do peculiar things to the CPU. I'm not sure if - `outb_p' is needed instead of just `outb'. Use it to be on the - safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) - */ - spin_lock(&rtc_lock); - CMOS_WRITE(0x00, 0x8f); - spin_unlock(&rtc_lock); - - /* - * Switch back to the initial page table. - */ - load_cr3(initial_page_table); - - /* Write 0x1234 to absolute memory location 0x472. The BIOS reads - this on booting to tell it to "Bypass memory test (also warm - boot)". This seems like a fairly standard thing that gets set by - REBOOT.COM programs, and the previous reset routine did this - too. */ - *((unsigned short *)0x472) = reboot_mode; - - /* Patch the GDT in the low memory trampoline */ - lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); - - restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); - restart_pa = virt_to_phys(restart_va); - restart_lowmem = (void (*)(unsigned int))restart_pa; - - /* GDT[0]: GDT self-pointer */ - lowmem_gdt[0] = - (u64)(sizeof(machine_real_restart_gdt) - 1) + - ((u64)virt_to_phys(lowmem_gdt) << 16); - /* GDT[1]: 64K real mode code segment */ - lowmem_gdt[1] = - GDT_ENTRY(0x009b, restart_pa, 0xffff); - - /* Jump to the identity-mapped low memory code */ - restart_lowmem(type); -} -#ifdef CONFIG_APM_MODULE -EXPORT_SYMBOL(machine_real_restart); -#endif - -#endif /* CONFIG_X86_32 */ - -/* - * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot - */ -static int __init set_pci_reboot(const struct dmi_system_id *d) -{ - if (reboot_type != BOOT_CF9) { - reboot_type = BOOT_CF9; - printk(KERN_INFO "%s series board detected. " - "Selecting PCI-method for reboots.\n", d->ident); - } - return 0; -} - -static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { { /* Handle problems with rebooting on Apple MacBook5 */ .callback = set_pci_reboot, .ident = "Apple MacBook5", @@ -474,17 +473,17 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { { } }; -static int __init pci_reboot_init(void) +static int __init reboot_init(void) { - /* Only do the DMI check if reboot_type hasn't been overridden + /* + * Only do the DMI check if reboot_type hasn't been overridden * on the command line */ - if (reboot_default) { - dmi_check_system(pci_reboot_dmi_table); - } + if (reboot_default) + dmi_check_system(reboot_dmi_table); return 0; } -core_initcall(pci_reboot_init); +core_initcall(reboot_init); static inline void kb_wait(void) { @@ -502,14 +501,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs) cpu_emergency_vmxoff(); } -/* Use NMIs as IPIs to tell all CPUs to disable virtualization - */ +/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ static void emergency_vmx_disable_all(void) { /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); - /* We need to disable VMX on all CPUs before rebooting, otherwise + /* + * We need to disable VMX on all CPUs before rebooting, otherwise * we risk hanging up the machine, because the CPU ignore INIT * signals when VMX is enabled. * @@ -528,8 +527,7 @@ static void emergency_vmx_disable_all(void) * is still enabling VMX. */ if (cpu_has_vmx() && cpu_vmx_enabled()) { - /* Disable VMX on this CPU. - */ + /* Disable VMX on this CPU. */ cpu_vmxoff(); /* Halt and disable VMX on the other CPUs */ @@ -574,12 +572,12 @@ static void native_machine_emergency_restart(void) /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { case BOOT_KBD: - mach_reboot_fixups(); /* for board specific fixups */ + mach_reboot_fixups(); /* For board specific fixups */ for (i = 0; i < 10; i++) { kb_wait(); udelay(50); - outb(0xfe, 0x64); /* pulse reset low */ + outb(0xfe, 0x64); /* Pulse reset low */ udelay(50); } if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { @@ -621,7 +619,7 @@ static void native_machine_emergency_restart(void) case BOOT_CF9: port_cf9_safe = true; - /* fall through */ + /* Fall through */ case BOOT_CF9_COND: if (port_cf9_safe) { @@ -659,7 +657,8 @@ void native_machine_shutdown(void) /* Make certain I only run on the appropriate processor */ set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); - /* O.K Now that I'm on the appropriate processor, + /* + * O.K Now that I'm on the appropriate processor, * stop all of the others. */ stop_other_cpus(); @@ -697,12 +696,11 @@ static void native_machine_restart(char *__unused) static void native_machine_halt(void) { - /* stop other cpus and apics */ + /* Stop other cpus and apics */ machine_shutdown(); tboot_shutdown(TB_SHUTDOWN_HALT); - /* stop this cpu */ stop_this_cpu(NULL); } @@ -713,7 +711,7 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } - /* a fallback in case there is no PM info available */ + /* A fallback in case there is no PM info available */ tboot_shutdown(TB_SHUTDOWN_HALT); } @@ -775,7 +773,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) cpu = raw_smp_processor_id(); - /* Don't do anything if this handler is invoked on crashing cpu. + /* + * Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get * an NMI if system was initially booted with nmi_watchdog parameter. */ @@ -799,7 +798,8 @@ static void smp_send_nmi_allbutself(void) apic->send_IPI_allbutself(NMI_VECTOR); } -/* Halt all other CPUs, calling the specified function on each of them +/* + * Halt all other CPUs, calling the specified function on each of them * * This function can be used to halt all other CPUs on crash * or emergency reboot time. The function passed as parameter @@ -810,7 +810,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) unsigned long msecs; local_irq_disable(); - /* Make a note of crashing cpu. Will be used in NMI callback.*/ + /* Make a note of crashing cpu. Will be used in NMI callback. */ crashing_cpu = safe_smp_processor_id(); shootdown_callback = callback; @@ -819,8 +819,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Would it be better to replace the trap vector here? */ if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, NMI_FLAG_FIRST, "crash")) - return; /* return what? */ - /* Ensure the new callback function is set before sending + return; /* Return what? */ + /* + * Ensure the new callback function is set before sending * out the NMI */ wmb(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88638883176a..9b4204e06665 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -90,7 +90,6 @@ #include <asm/processor.h> #include <asm/bugs.h> -#include <asm/system.h> #include <asm/vsyscall.h> #include <asm/cpu.h> #include <asm/desc.h> @@ -394,10 +393,9 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); - printk(KERN_ERR "initrd too large to handle, " - "disabling initrd\n"); - return; + panic("initrd too large to handle, " + "disabling initrd (%lld needed, %lld available)\n", + ramdisk_size, end_of_lowmem>>1); } printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, @@ -509,15 +507,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_pfn - min_low_pfn; - - return total << PAGE_SHIFT; -} - /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. @@ -536,7 +525,7 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - total_mem = get_total_mem(); + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); @@ -1022,7 +1011,8 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - ioapic_and_gsi_init(); + if (x86_io_apic_ops.init) + x86_io_apic_ops.init(); kvm_guest_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 71f4727da373..5a98aa272184 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void) #endif rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_PAGE) { - const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + size_t atom_size; + /* + * On 64bit, use PMD_SIZE for atom_size so that embedded + * percpu areas are aligned to PMD. This, in the future, + * can also allow using PMD mappings in vmalloc area. Use + * PAGE_SIZE on 32bit as vmalloc space is highly contended + * and large vmalloc area allocs can easily fail. + */ +#ifdef CONFIG_X86_64 + atom_size = PMD_SIZE; +#else + atom_size = PAGE_SIZE; +#endif rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 25edcfc9ba5b..115eac431483 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -10,10 +10,8 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> -#include <linux/signal.h> #include <linux/errno.h> #include <linux/wait.h> -#include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/unistd.h> #include <linux/stddef.h> @@ -27,10 +25,12 @@ #include <asm/fpu-internal.h> #include <asm/vdso.h> #include <asm/mce.h> +#include <asm/sighandling.h> #ifdef CONFIG_X86_64 #include <asm/proto.h> #include <asm/ia32_unistd.h> +#include <asm/sys_ia32.h> #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> @@ -38,13 +38,6 @@ #include <asm/sigframe.h> -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ - X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ - X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) - #ifdef CONFIG_X86_32 # define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) #else @@ -69,9 +62,8 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, + unsigned long *pax) { void __user *buf; unsigned int tmpflags; @@ -126,9 +118,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, return err; } -static int -setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) +int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) { int err = 0; @@ -160,7 +151,7 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->r15, &sc->r15); #endif /* CONFIG_X86_64 */ - put_user_ex(current->thread.trap_no, &sc->trapno); + put_user_ex(current->thread.trap_nr, &sc->trapno); put_user_ex(current->thread.error_code, &sc->err); put_user_ex(regs->ip, &sc->ip); #ifdef CONFIG_X86_32 @@ -643,6 +634,16 @@ static int signr_convert(int sig) #define is_ia32 0 #endif /* CONFIG_IA32_EMULATION */ +#ifdef CONFIG_X86_X32_ABI +#define is_x32 test_thread_flag(TIF_X32) + +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, + siginfo_t *info, compat_sigset_t *set, + struct pt_regs *regs); +#else /* !CONFIG_X86_X32_ABI */ +#define is_x32 0 +#endif /* CONFIG_X86_X32_ABI */ + int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs); int ia32_setup_frame(int sig, struct k_sigaction *ka, @@ -667,8 +668,14 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else ret = ia32_setup_frame(usig, ka, set, regs); - } else +#ifdef CONFIG_X86_X32_ABI + } else if (is_x32) { + ret = x32_setup_rt_frame(usig, ka, info, + (compat_sigset_t *)set, regs); +#endif + } else { ret = __setup_rt_frame(sig, ka, info, set, regs); + } if (ret) { force_sigsegv(sig, current); @@ -851,3 +858,102 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV, me); } + +#ifdef CONFIG_X86_X32_ABI +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, + siginfo_t *info, compat_sigset_t *set, + struct pt_regs *regs) +{ + struct rt_sigframe_x32 __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user32(&frame->info, info)) + return -EFAULT; + } + + put_user_try { + /* Create the ucontext. */ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + put_user_ex(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + put_user_ex(0, &frame->uc.uc__pad0); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (ka->sa.sa_flags & SA_RESTORER) { + restorer = ka->sa.sa_restorer; + } else { + /* could use a vstub here */ + restorer = NULL; + err |= -EFAULT; + } + put_user_ex(restorer, &frame->pretcode); + } put_user_catch(err); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + + /* We use the x32 calling convention here... */ + regs->di = sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; + + return 0; +} + +asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long ax; + struct pt_regs tregs; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + tregs = *regs; + if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 66c74f481cab..48d2b7ded422 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -109,6 +109,9 @@ * about nothing of note with C stepping upwards. */ +static atomic_t stopping_cpu = ATOMIC_INIT(-1); +static bool smp_no_nmi_ipi = false; + /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing @@ -149,8 +152,6 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } -static atomic_t stopping_cpu = ATOMIC_INIT(-1); - static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) { /* We are registered on stopping cpu too, avoid spurious NMI */ @@ -162,7 +163,19 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; } -static void native_nmi_stop_other_cpus(int wait) +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ + +asmlinkage void smp_reboot_interrupt(void) +{ + ack_APIC_irq(); + irq_enter(); + stop_this_cpu(NULL); + irq_exit(); +} + +static void native_stop_other_cpus(int wait) { unsigned long flags; unsigned long timeout; @@ -174,20 +187,25 @@ static void native_nmi_stop_other_cpus(int wait) * Use an own vector here because smp_call_function * does lots of things not suitable in a panic situation. */ + + /* + * We start by using the REBOOT_VECTOR irq. + * The irq is treated as a sync point to allow critical + * regions of code on other cpus to release their spin locks + * and re-enable irqs. Jumping straight to an NMI might + * accidentally cause deadlocks with further shutdown/panic + * code. By syncing, we give the cpus up to one second to + * finish their work before we force them off with the NMI. + */ if (num_online_cpus() > 1) { /* did someone beat us here? */ if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) return; - if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, - NMI_FLAG_FIRST, "smp_stop")) - /* Note: we ignore failures here */ - return; - - /* sync above data before sending NMI */ + /* sync above data before sending IRQ */ wmb(); - apic->send_IPI_allbutself(NMI_VECTOR); + apic->send_IPI_allbutself(REBOOT_VECTOR); /* * Don't wait longer than a second if the caller @@ -197,63 +215,37 @@ static void native_nmi_stop_other_cpus(int wait) while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } + + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) { + if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop")) + /* Note: we ignore failures here */ + /* Hope the REBOOT_IRQ is good enough */ + goto finish; - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); -} - -/* - * this function calls the 'stop' function on all other CPUs in the system. - */ - -asmlinkage void smp_reboot_interrupt(void) -{ - ack_APIC_irq(); - irq_enter(); - stop_this_cpu(NULL); - irq_exit(); -} - -static void native_irq_stop_other_cpus(int wait) -{ - unsigned long flags; - unsigned long timeout; + /* sync above data before sending IRQ */ + wmb(); - if (reboot_force) - return; + pr_emerg("Shutting down cpus with NMI\n"); - /* - * Use an own vector here because smp_call_function - * does lots of things not suitable in a panic situation. - * On most systems we could also use an NMI here, - * but there are a few systems around where NMI - * is problematic so stay with an non NMI for now - * (this implies we cannot stop CPUs spinning with irq off - * currently) - */ - if (num_online_cpus() > 1) { - apic->send_IPI_allbutself(REBOOT_VECTOR); + apic->send_IPI_allbutself(NMI_VECTOR); /* - * Don't wait longer than a second if the caller + * Don't wait longer than a 10 ms if the caller * didn't ask us to wait. */ - timeout = USEC_PER_SEC; + timeout = USEC_PER_MSEC * 10; while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } +finish: local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); } -static void native_smp_disable_nmi_ipi(void) -{ - smp_ops.stop_other_cpus = native_irq_stop_other_cpus; -} - /* * Reschedule call back. */ @@ -287,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) static int __init nonmi_ipi_setup(char *str) { - native_smp_disable_nmi_ipi(); - return 1; + smp_no_nmi_ipi = true; + return 1; } __setup("nonmi_ipi", nonmi_ipi_setup); @@ -298,7 +290,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .stop_other_cpus = native_nmi_stop_other_cpus, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e578a79a3093..433529e29be4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -50,6 +50,7 @@ #include <linux/tboot.h> #include <linux/stackprotector.h> #include <linux/gfp.h> +#include <linux/cpuidle.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -75,20 +76,8 @@ /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; -/* Store all idle threads, this can be reused instead of creating -* a new thread. Also avoids complicated thread destroy functionality -* for idle threads. -*/ #ifdef CONFIG_HOTPLUG_CPU /* - * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is - * removed after init for !CONFIG_HOTPLUG_CPU. - */ -static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); -#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) -#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) - -/* * We need this for trampoline_base protection from concurrent accesses when * off- and onlining cores wildly. */ @@ -96,20 +85,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); void cpu_hotplug_driver_lock(void) { - mutex_lock(&x86_cpu_hotplug_driver_mutex); + mutex_lock(&x86_cpu_hotplug_driver_mutex); } void cpu_hotplug_driver_unlock(void) { - mutex_unlock(&x86_cpu_hotplug_driver_mutex); + mutex_unlock(&x86_cpu_hotplug_driver_mutex); } ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } -#else -static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; -#define get_idle_for_cpu(x) (idle_thread_array[(x)]) -#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) #endif /* Number of siblings per CPU package */ @@ -219,14 +204,9 @@ static void __cpuinit smp_callin(void) * Update loops_per_jiffy in cpu_data. Previous call to * smp_store_cpu_info() stored a value that is close but not as * accurate as the value just calculated. - * - * Need to enable IRQs because it can take longer and then - * the NMI watchdog might kill us. */ - local_irq_enable(); calibrate_delay(); cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; - local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); /* @@ -255,6 +235,7 @@ notrace static void __cpuinit start_secondary(void *unused) * most necessary things. */ cpu_init(); + x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); @@ -318,59 +299,90 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +static bool __cpuinit +topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), + "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " + "[node: %d != %d]. Ignoring dependency.\n", + cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); +} + +#define link_mask(_m, c1, c2) \ +do { \ + cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ + cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ +} while (0) + +static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && + c->compute_unit_id == o->compute_unit_id) + return topology_sane(c, o, "smt"); + + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + return topology_sane(c, o, "smt"); + } + + return false; +} + +static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); - cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); - cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) + return topology_sane(c, o, "llc"); + + return false; } +static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return topology_sane(c, o, "mc"); + + return false; +} void __cpuinit set_cpu_sibling_map(int cpu) { - int i; + bool has_mc = boot_cpu_data.x86_max_cores > 1; + bool has_smt = smp_num_siblings > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cpuinfo_x86 *o; + int i; cpumask_set_cpu(cpu, cpu_sibling_setup_mask); - if (smp_num_siblings > 1) { - for_each_cpu(i, cpu_sibling_setup_mask) { - struct cpuinfo_x86 *o = &cpu_data(i); - - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { - if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && - c->compute_unit_id == o->compute_unit_id) - link_thread_siblings(cpu, i); - } else if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - link_thread_siblings(cpu, i); - } - } - } else { + if (!has_smt && !has_mc) { cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); - } - - cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); - - if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { - cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); c->booted_cores = 1; return; } for_each_cpu(i, cpu_sibling_setup_mask) { - if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); - cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); - } - if (c->phys_proc_id == cpu_data(i).phys_proc_id) { - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); + o = &cpu_data(i); + + if ((i == cpu) || (has_smt && match_smt(c, o))) + link_mask(sibling, cpu, i); + + if ((i == cpu) || (has_mc && match_llc(c, o))) + link_mask(llc_shared, cpu, i); + + if ((i == cpu) || (has_mc && match_mc(c, o))) { + link_mask(core, cpu, i); + /* * Does this new cpu bringup a new core? */ @@ -401,8 +413,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) * For perf, we return last level cache shared map. * And for power savings, we return cpu_core_map */ - if ((sched_mc_power_savings || sched_smt_power_savings) && - !(cpu_has(c, X86_FEATURE_AMD_DCM))) + if (!(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else return cpu_llc_shared_mask(cpu); @@ -621,22 +632,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -struct create_idle { - struct work_struct work; - struct task_struct *idle; - struct completion done; - int cpu; -}; - -static void __cpuinit do_fork_idle(struct work_struct *work) -{ - struct create_idle *c_idle = - container_of(work, struct create_idle, work); - - c_idle->idle = fork_idle(c_idle->cpu); - complete(&c_idle->done); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void __cpuinit announce_cpu(int cpu, int apicid) { @@ -663,58 +658,31 @@ static void __cpuinit announce_cpu(int cpu, int apicid) * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. */ -static int __cpuinit do_boot_cpu(int apicid, int cpu) +static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { unsigned long boot_error = 0; unsigned long start_ip; int timeout; - struct create_idle c_idle = { - .cpu = cpu, - .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), - }; - - INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); - c_idle.idle = get_idle_for_cpu(cpu); + idle->thread.sp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + task_stack_page(idle))) - 1); + per_cpu(current_task, cpu) = idle; - /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. - */ - if (c_idle.idle) { - c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); - init_idle(c_idle.idle, cpu); - goto do_rest; - } - - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - - if (IS_ERR(c_idle.idle)) { - printk("failed fork for CPU %d\n", cpu); - destroy_work_on_stack(&c_idle.work); - return PTR_ERR(c_idle.idle); - } - - set_idle_for_cpu(cpu, c_idle.idle); -do_rest: - per_cpu(current_task, cpu) = c_idle.idle; #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); #else - clear_tsk_thread_flag(c_idle.idle, TIF_FORK); + clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(c_idle.idle) - + (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = c_idle.idle->thread.sp; + stack_start = idle->thread.sp; /* start_ip had better be page-aligned! */ start_ip = trampoline_address(); @@ -816,12 +784,10 @@ do_rest: */ smpboot_restore_warm_reset_vector(); } - - destroy_work_on_stack(&c_idle.work); return boot_error; } -int __cpuinit native_cpu_up(unsigned int cpu) +int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); unsigned long flags; @@ -854,7 +820,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; - err = do_boot_cpu(apicid, cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; @@ -1408,7 +1374,8 @@ void native_play_dead(void) tboot_shutdown(TB_SHUTDOWN_WFS); mwait_play_dead(); /* Only returns on failure */ - hlt_play_dead(); + if (cpuidle_play_dead()) + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index ef59642ff1bf..b4d3c3927dd8 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -98,7 +98,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { + if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { unsigned long new_begin; /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit @@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) + if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) && len <= mm->cached_hole_size) { mm->cached_hole_size = 0; mm->free_area_cache = begin; @@ -205,7 +205,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legact mmap base */ - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) + if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 7ac7943be02c..5c7f8c20da74 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,6 +5,14 @@ #include <linux/cache.h> #include <asm/asm-offsets.h> +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif + #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; #include <asm/syscalls_64.h> #undef __SYSCALL_64 diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e2410e27f97e..6410744ac5cb 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -272,7 +272,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) offsetof(struct acpi_table_facs, firmware_waking_vector); } -void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) { static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { /* S0,1,2: */ -1, -1, -1, @@ -281,7 +281,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) /* S5: */ TB_SHUTDOWN_S5 }; if (!tboot_enabled()) - return; + return 0; tboot_copy_fadt(&acpi_gbl_FADT); tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; @@ -292,10 +292,11 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { pr_warning("unsupported sleep state 0x%x\n", sleep_state); - return; + return -1; } tboot_shutdown(acpi_shutdown_map[sleep_state]); + return 0; } static atomic_t ap_wfs_count; @@ -345,6 +346,8 @@ static __init int tboot_late_init(void) atomic_set(&ap_wfs_count, 0); register_hotcpu_notifier(&tboot_cpu_notifier); + + acpi_os_set_prepare_sleep(&tboot_sleep); return 0; } diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index 9e540fee7009..ab40954e113e 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -34,6 +34,7 @@ #include <asm/tce.h> #include <asm/calgary.h> #include <asm/proto.h> +#include <asm/cacheflush.h> /* flush a tce at 'tceaddr' to main memory */ static inline void flush_tce(void* tceaddr) diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index c29e235792af..b79133abda48 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <asm/cacheflush.h> #include <asm/sections.h> +#include <asm/asm.h> int rodata_test(void) { @@ -42,14 +43,7 @@ int rodata_test(void) ".section .fixup,\"ax\"\n" "2: jmp 1b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 16\n" -#ifdef CONFIG_X86_32 - " .long 0b,2b\n" -#else - " .quad 0b,2b\n" -#endif - ".previous" + _ASM_EXTABLE(0b,2b) : [rslt] "=r" (result) : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) ); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6bb7b8579e70..9d9d2f9e77a5 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -6,7 +6,6 @@ #include <asm/uaccess.h> #include <asm/desc.h> -#include <asm/system.h> #include <asm/ldt.h> #include <asm/processor.h> #include <asm/proto.h> @@ -163,7 +162,7 @@ int regset_tls_get(struct task_struct *target, const struct user_regset *regset, { const struct desc_struct *tls; - if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || (count % sizeof(struct user_desc)) != 0) return -EINVAL; @@ -198,7 +197,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; - if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || (count % sizeof(struct user_desc)) != 0) return -EINVAL; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ec61d4c1b93b..92d5756d85fc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,7 +50,7 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <linux/atomic.h> -#include <asm/system.h> +#include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> @@ -120,7 +120,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ - if (trapnr < 6) + if (trapnr < X86_TRAP_UD) goto vm86_trap; goto trap_signal; } @@ -133,7 +133,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, trap_signal: #endif /* - * We want error_code and trap_no set for userspace faults and + * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not * kernelspace faults which are fixed up. die() gives the * process no chance to handle the signal and notice the @@ -142,7 +142,7 @@ trap_signal: * delivered, faults. See also do_general_protection below. */ tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; + tsk->thread.trap_nr = trapnr; #ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && @@ -165,7 +165,7 @@ trap_signal: kernel_trap: if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; + tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } return; @@ -204,27 +204,31 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, + regs->ip) +DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, + regs->ip) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", + coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) #ifdef CONFIG_X86_32 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) #endif -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, + BUS_ADRALN, 0) #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) + X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) return; preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); preempt_conditional_cli(regs); } @@ -234,10 +238,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) struct task_struct *tsk = current; /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; + tsk->thread.trap_nr = X86_TRAP_DF; /* * This is always a kernel trap and never fixable (and thus must @@ -265,7 +269,7 @@ do_general_protection(struct pt_regs *regs, long error_code) goto gp_in_kernel; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; + tsk->thread.trap_nr = X86_TRAP_GP; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { @@ -292,24 +296,29 @@ gp_in_kernel: return; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) + tsk->thread.trap_nr = X86_TRAP_GP; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) return; die("general protection fault", regs, error_code); } /* May run on IST stack. */ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_DYNAMIC_FTRACE + /* ftrace must be first, everything else may cause a recursive crash */ + if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + return; +#endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) + if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) return; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) + if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) return; /* @@ -318,7 +327,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) */ debug_stack_usage_inc(); preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); debug_stack_usage_dec(); } @@ -423,8 +432,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_sti(regs); if (regs->flags & X86_VM_MASK) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, 1); + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, + X86_TRAP_DB); preempt_conditional_cli(regs); debug_stack_usage_dec(); return; @@ -461,7 +470,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) struct task_struct *task = current; siginfo_t info; unsigned short err; - char *str = (trapnr == 16) ? "fpu exception" : "simd exception"; + char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : + "simd exception"; if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; @@ -471,7 +481,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; - task->thread.trap_no = trapnr; + task->thread.trap_nr = trapnr; die(str, regs, error_code); } return; @@ -481,12 +491,12 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) * Save the info for the exception handler and clear the error. */ save_init_fpu(task); - task->thread.trap_no = trapnr; + task->thread.trap_nr = trapnr; task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; info.si_addr = (void __user *)regs->ip; - if (trapnr == 16) { + if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -530,10 +540,11 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) info.si_code = FPE_FLTRES; } else { /* - * If we're using IRQ 13, or supposedly even some trap 16 - * implementations, it's possible we get a spurious trap... + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. */ - return; /* Spurious trap, no error */ + return; } force_sig_info(SIGFPE, &info, task); } @@ -544,13 +555,13 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) ignore_fpu_irq = 1; #endif - math_error(regs, error_code, 16); + math_error(regs, error_code, X86_TRAP_MF); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - math_error(regs, error_code, 19); + math_error(regs, error_code, X86_TRAP_XF); } dotraplinkage void @@ -644,20 +655,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) info.si_errno = 0; info.si_code = ILL_BADSTK; info.si_addr = NULL; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) + if (notify_die(DIE_TRAP, "iret exception", regs, error_code, + X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + &info); } #endif /* Set of traps needed for early debugging. */ void __init early_trap_init(void) { - set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); - set_intr_gate(14, &page_fault); + set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + set_intr_gate(X86_TRAP_PF, &page_fault); load_idt(&idt_descr); } @@ -673,30 +685,30 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_intr_gate(0, ÷_error); - set_intr_gate_ist(2, &nmi, NMI_STACK); + set_intr_gate(X86_TRAP_DE, ÷_error); + set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); /* int4 can be called from all */ - set_system_intr_gate(4, &overflow); - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); + set_system_intr_gate(X86_TRAP_OF, &overflow); + set_intr_gate(X86_TRAP_BR, &bounds); + set_intr_gate(X86_TRAP_UD, &invalid_op); + set_intr_gate(X86_TRAP_NM, &device_not_available); #ifdef CONFIG_X86_32 - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); + set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); #else - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); + set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); #endif - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); + set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); + set_intr_gate(X86_TRAP_TS, &invalid_TSS); + set_intr_gate(X86_TRAP_NP, &segment_not_present); + set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_GP, &general_protection); + set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); + set_intr_gate(X86_TRAP_MF, &coprocessor_error); + set_intr_gate(X86_TRAP_AC, &alignment_check); #ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); + set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); #endif - set_intr_gate(19, &simd_coprocessor_error); + set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error); /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) @@ -721,7 +733,7 @@ void __init trap_init(void) #ifdef CONFIG_X86_64 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(1, &debug); - set_nmi_gate(3, &int3); + set_nmi_gate(X86_TRAP_DB, &debug); + set_nmi_gate(X86_TRAP_BP, &int3); #endif } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 183c5925a9fe..fc0a147e3727 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -630,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) static unsigned long long cyc2ns_suspend; -void save_sched_clock_state(void) +void tsc_save_sched_clock_state(void) { if (!sched_clock_stable) return; @@ -646,7 +646,7 @@ void save_sched_clock_state(void) * that sched_clock() continues from the point where it was left off during * suspend. */ -void restore_sched_clock_state(void) +void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; @@ -933,6 +933,16 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } + + /* + * Trust the results of the earlier calibration on systems + * exporting a reliable TSC. + */ + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + clocksource_register_khz(&clocksource_tsc, tsc_khz); + return 0; + } + schedule_delayed_work(&tsc_irqwork, 0); return 0; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 328cb37bb827..255f58ae71e8 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -569,7 +569,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) } if (trapno != 1) return 1; /* we let this handle by the calling routine */ - current->thread.trap_no = trapno; + current->thread.trap_nr = trapno; current->thread.error_code = error_code; force_sig(SIGTRAP, current); return 0; diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index a1d804bcd483..8eeb55a551b4 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -15,6 +15,7 @@ #include <linux/init.h> #include <linux/pci_ids.h> #include <linux/pci_regs.h> +#include <linux/smp.h> #include <asm/apic.h> #include <asm/pci-direct.h> @@ -22,6 +23,8 @@ #include <asm/paravirt.h> #include <asm/setup.h> +#define TOPOLOGY_REGISTER_OFFSET 0x10 + #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -149,12 +152,49 @@ int is_vsmp_box(void) return 0; } #endif + +static void __init vsmp_cap_cpus(void) +{ +#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) + void __iomem *address; + unsigned int cfg, topology, node_shift, maxcpus; + + /* + * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the + * ones present in the first board, unless explicitly overridden by + * setup_max_cpus + */ + if (setup_max_cpus != NR_CPUS) + return; + + /* Read the vSMP Foundation topology register */ + cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0); + address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4); + if (WARN_ON(!address)) + return; + + topology = readl(address); + node_shift = (topology >> 16) & 0x7; + if (!node_shift) + /* The value 0 should be decoded as 8 */ + node_shift = 8; + maxcpus = (topology & ((1 << node_shift) - 1)) + 1; + + pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n", + maxcpus); + setup_max_cpus = maxcpus; + early_iounmap(address, 4); +#endif +} + void __init vsmp_init(void) { detect_vsmp_box(); if (!is_vsmp_box()) return; + vsmp_cap_cpus(); + set_vsmp_pv_ops(); return; } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b07ba9393564..7515cf0e1805 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -52,10 +52,7 @@ #include "vsyscall_trace.h" DEFINE_VVAR(int, vgetcpu_mode); -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = -{ - .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), -}; +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; @@ -80,20 +77,15 @@ early_param("vsyscall", vsyscall_setup); void update_vsyscall_tz(void) { - unsigned long flags; - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); - /* sys_tz has changed */ vsyscall_gtod_data.sys_tz = sys_tz; - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { - unsigned long flags; + struct timespec monotonic; - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_seqcount_begin(&vsyscall_gtod_data.seq); /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; @@ -101,12 +93,19 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = *wtm; + + monotonic = timespec_add(*wall_time, *wtm); + vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec; + vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.monotonic_time_coarse = + timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm); - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_seqcount_end(&vsyscall_gtod_data.seq); } static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, @@ -153,7 +152,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) thread->error_code = 6; /* user fault, no page, write */ thread->cr2 = ptr; - thread->trap_no = 14; + thread->trap_nr = X86_TRAP_PF; memset(&info, 0, sizeof(info)); info.si_signo = SIGSEGV; @@ -217,9 +216,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = 1; /* - * 0 is a valid user pointer (in the access_ok sense) on 32-bit and + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, 0 means "don't write anything" not "write it at + * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ ret = -EFAULT; @@ -248,7 +247,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, - 0); + NULL); break; } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 947a06ccc673..35c5e543f550 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -18,6 +18,7 @@ #include <asm/e820.h> #include <asm/time.h> #include <asm/irq.h> +#include <asm/io_apic.h> #include <asm/pat.h> #include <asm/tsc.h> #include <asm/iommu.h> @@ -91,8 +92,8 @@ struct x86_init_ops x86_init __initdata = { }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, - .fixup_cpu_id = x86_default_fixup_cpu_id, }; static void default_nmi_init(void) { }; @@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = { .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .i8042_detect = default_i8042_detect + .i8042_detect = default_i8042_detect, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, }; EXPORT_SYMBOL_GPL(x86_platform); @@ -117,3 +120,10 @@ struct x86_msi_ops x86_msi = { .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, }; + +struct x86_io_apic_ops x86_io_apic_ops = { + .init = native_io_apic_init_mappings, + .read = native_io_apic_read, + .write = native_io_apic_write, + .modify = native_io_apic_modify, +}; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index e62728e30b01..bd18149b2b0f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!fx) return; - BUG_ON(__thread_has_fpu(tsk)); - xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; /* |
