summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/sleep.c7
-rw-r--r--arch/x86/kernel/amd_nb.c4
-rw-r--r--arch/x86/kernel/apic/apic.c3
-rw-r--r--arch/x86/kernel/apic/io_apic.c6
-rw-r--r--arch/x86/kernel/apic/vector.c276
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-genpool.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c3
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c38
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c12
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c24
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c13
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h1
-rw-r--r--arch/x86/kernel/cpu/perf_event.c13
-rw-r--r--arch/x86/kernel/cpu/perf_event.h3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c29
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c56
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c33
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_knc.c4
-rw-r--r--arch/x86/kernel/early-quirks.c105
-rw-r--r--arch/x86/kernel/ioport.c12
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/kprobes/core.c12
-rw-r--r--arch/x86/kernel/process_64.c12
-rw-r--r--arch/x86/kernel/pvclock.c4
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/sysfb_efi.c14
-rw-r--r--arch/x86/kernel/traps.c20
-rw-r--r--arch/x86/kernel/tsc_msr.c2
-rw-r--r--arch/x86/kernel/uprobes.c22
30 files changed, 579 insertions, 174 deletions
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d1daead5fcdd..adb3eaf8fe2a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -16,6 +16,7 @@
#include <asm/cacheflush.h>
#include <asm/realmode.h>
+#include <linux/ftrace.h>
#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
@@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
+ /*
+ * Pause/unpause graph tracing around do_suspend_lowlevel as it has
+ * inconsistent call/return info after it jumps to the wakeup vector.
+ */
+ pause_graph_tracing();
do_suspend_lowlevel();
+ unpause_graph_tracing();
return 0;
}
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 29fa475ec518..c986d0b3bc35 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -71,8 +71,8 @@ int amd_cache_northbridges(void)
while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
i++;
- if (i == 0)
- return 0;
+ if (!i)
+ return -ENODEV;
nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2f69e3b184f6..a3e1f8497f8c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1587,6 +1587,9 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
int ret, ir_stat;
+ if (skip_ioapic_setup)
+ return;
+
ir_stat = irq_remapping_prepare();
if (ir_stat < 0 && !x2apic_supported())
return;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f25321894ad2..fdb0fbfb1197 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2521,6 +2521,7 @@ void __init setup_ioapic_dest(void)
{
int pin, ioapic, irq, irq_entry;
const struct cpumask *mask;
+ struct irq_desc *desc;
struct irq_data *idata;
struct irq_chip *chip;
@@ -2536,7 +2537,9 @@ void __init setup_ioapic_dest(void)
if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq))
continue;
- idata = irq_get_irq_data(irq);
+ desc = irq_to_desc(irq);
+ raw_spin_lock_irq(&desc->lock);
+ idata = irq_desc_get_irq_data(desc);
/*
* Honour affinities which have been set in early boot
@@ -2550,6 +2553,7 @@ void __init setup_ioapic_dest(void)
/* Might be lapic_chip for irq 0 */
if (chip->irq_set_affinity)
chip->irq_set_affinity(idata, mask, false);
+ raw_spin_unlock_irq(&desc->lock);
}
}
#endif
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 861bc59c8f25..df6b4eeac0bd 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -30,7 +30,7 @@ struct apic_chip_data {
struct irq_domain *x86_vector_domain;
static DEFINE_RAW_SPINLOCK(vector_lock);
-static cpumask_var_t vector_cpumask;
+static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask;
static struct irq_chip lapic_controller;
#ifdef CONFIG_X86_IO_APIC
static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
@@ -116,35 +116,47 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
*/
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
- int cpu, err;
+ int cpu, vector;
- if (d->move_in_progress)
+ /*
+ * If there is still a move in progress or the previous move has not
+ * been cleaned up completely, tell the caller to come back later.
+ */
+ if (d->move_in_progress ||
+ cpumask_intersects(d->old_domain, cpu_online_mask))
return -EBUSY;
/* Only try and allocate irqs on cpus that are present */
- err = -ENOSPC;
cpumask_clear(d->old_domain);
+ cpumask_clear(searched_cpumask);
cpu = cpumask_first_and(mask, cpu_online_mask);
while (cpu < nr_cpu_ids) {
- int new_cpu, vector, offset;
+ int new_cpu, offset;
+ /* Get the possible target cpus for @mask/@cpu from the apic */
apic->vector_allocation_domain(cpu, vector_cpumask, mask);
+ /*
+ * Clear the offline cpus from @vector_cpumask for searching
+ * and verify whether the result overlaps with @mask. If true,
+ * then the call to apic->cpu_mask_to_apicid_and() will
+ * succeed as well. If not, no point in trying to find a
+ * vector in this mask.
+ */
+ cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask);
+ if (!cpumask_intersects(vector_searchmask, mask))
+ goto next_cpu;
+
if (cpumask_subset(vector_cpumask, d->domain)) {
- err = 0;
if (cpumask_equal(vector_cpumask, d->domain))
- break;
+ goto success;
/*
- * New cpumask using the vector is a proper subset of
- * the current in use mask. So cleanup the vector
- * allocation for the members that are not used anymore.
+ * Mark the cpus which are not longer in the mask for
+ * cleanup.
*/
- cpumask_andnot(d->old_domain, d->domain,
- vector_cpumask);
- d->move_in_progress =
- cpumask_intersects(d->old_domain, cpu_online_mask);
- cpumask_and(d->domain, d->domain, vector_cpumask);
- break;
+ cpumask_andnot(d->old_domain, d->domain, vector_cpumask);
+ vector = d->cfg.vector;
+ goto update;
}
vector = current_vector;
@@ -156,45 +168,61 @@ next:
vector = FIRST_EXTERNAL_VECTOR + offset;
}
- if (unlikely(current_vector == vector)) {
- cpumask_or(d->old_domain, d->old_domain,
- vector_cpumask);
- cpumask_andnot(vector_cpumask, mask, d->old_domain);
- cpu = cpumask_first_and(vector_cpumask,
- cpu_online_mask);
- continue;
- }
+ /* If the search wrapped around, try the next cpu */
+ if (unlikely(current_vector == vector))
+ goto next_cpu;
if (test_bit(vector, used_vectors))
goto next;
- for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
+ for_each_cpu(new_cpu, vector_searchmask) {
if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
goto next;
}
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (d->cfg.vector) {
+ /* Schedule the old vector for cleanup on all cpus */
+ if (d->cfg.vector)
cpumask_copy(d->old_domain, d->domain);
- d->move_in_progress =
- cpumask_intersects(d->old_domain, cpu_online_mask);
- }
- for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
+ for_each_cpu(new_cpu, vector_searchmask)
per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
- d->cfg.vector = vector;
- cpumask_copy(d->domain, vector_cpumask);
- err = 0;
- break;
- }
+ goto update;
- if (!err) {
- /* cache destination APIC IDs into cfg->dest_apicid */
- err = apic->cpu_mask_to_apicid_and(mask, d->domain,
- &d->cfg.dest_apicid);
+next_cpu:
+ /*
+ * We exclude the current @vector_cpumask from the requested
+ * @mask and try again with the next online cpu in the
+ * result. We cannot modify @mask, so we use @vector_cpumask
+ * as a temporary buffer here as it will be reassigned when
+ * calling apic->vector_allocation_domain() above.
+ */
+ cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask);
+ cpumask_andnot(vector_cpumask, mask, searched_cpumask);
+ cpu = cpumask_first_and(vector_cpumask, cpu_online_mask);
+ continue;
}
+ return -ENOSPC;
- return err;
+update:
+ /*
+ * Exclude offline cpus from the cleanup mask and set the
+ * move_in_progress flag when the result is not empty.
+ */
+ cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
+ d->move_in_progress = !cpumask_empty(d->old_domain);
+ d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0;
+ d->cfg.vector = vector;
+ cpumask_copy(d->domain, vector_cpumask);
+success:
+ /*
+ * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail
+ * as we already established, that mask & d->domain & cpu_online_mask
+ * is not empty.
+ */
+ BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain,
+ &d->cfg.dest_apicid));
+ return 0;
}
static int assign_irq_vector(int irq, struct apic_chip_data *data,
@@ -224,11 +252,10 @@ static int assign_irq_vector_policy(int irq, int node,
static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
struct irq_desc *desc;
- unsigned long flags;
int cpu, vector;
- raw_spin_lock_irqsave(&vector_lock, flags);
- BUG_ON(!data->cfg.vector);
+ if (!data->cfg.vector)
+ return;
vector = data->cfg.vector;
for_each_cpu_and(cpu, data->domain, cpu_online_mask)
@@ -237,10 +264,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
data->cfg.vector = 0;
cpumask_clear(data->domain);
- if (likely(!data->move_in_progress)) {
- raw_spin_unlock_irqrestore(&vector_lock, flags);
+ /*
+ * If move is in progress or the old_domain mask is not empty,
+ * i.e. the cleanup IPI has not been processed yet, we need to remove
+ * the old references to desc from all cpus vector tables.
+ */
+ if (!data->move_in_progress && cpumask_empty(data->old_domain))
return;
- }
desc = irq_to_desc(irq);
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
@@ -253,7 +283,6 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
}
}
data->move_in_progress = 0;
- raw_spin_unlock_irqrestore(&vector_lock, flags);
}
void init_irq_alloc_info(struct irq_alloc_info *info,
@@ -274,19 +303,24 @@ void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
static void x86_vector_free_irqs(struct irq_domain *domain,
unsigned int virq, unsigned int nr_irqs)
{
+ struct apic_chip_data *apic_data;
struct irq_data *irq_data;
+ unsigned long flags;
int i;
for (i = 0; i < nr_irqs; i++) {
irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
if (irq_data && irq_data->chip_data) {
+ raw_spin_lock_irqsave(&vector_lock, flags);
clear_irq_vector(virq + i, irq_data->chip_data);
- free_apic_chip_data(irq_data->chip_data);
+ apic_data = irq_data->chip_data;
+ irq_domain_reset_irq_data(irq_data);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_apic_chip_data(apic_data);
#ifdef CONFIG_X86_IO_APIC
if (virq + i < nr_legacy_irqs())
legacy_irq_data[virq + i] = NULL;
#endif
- irq_domain_reset_irq_data(irq_data);
}
}
}
@@ -404,6 +438,8 @@ int __init arch_early_irq_init(void)
arch_init_htirq_domain(x86_vector_domain);
BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL));
return arch_early_ioapic_init();
}
@@ -492,14 +528,7 @@ static int apic_set_affinity(struct irq_data *irq_data,
return -EINVAL;
err = assign_irq_vector(irq, data, dest);
- if (err) {
- if (assign_irq_vector(irq, data,
- irq_data_get_affinity_mask(irq_data)))
- pr_err("Failed to recover vector for irq %d\n", irq);
- return err;
- }
-
- return IRQ_SET_MASK_OK;
+ return err ? err : IRQ_SET_MASK_OK;
}
static struct irq_chip lapic_controller = {
@@ -511,20 +540,12 @@ static struct irq_chip lapic_controller = {
#ifdef CONFIG_SMP
static void __send_cleanup_vector(struct apic_chip_data *data)
{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
-
- for_each_cpu_and(i, data->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i),
- IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
+ raw_spin_lock(&vector_lock);
+ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
data->move_in_progress = 0;
+ if (!cpumask_empty(data->old_domain))
+ apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
+ raw_spin_unlock(&vector_lock);
}
void send_cleanup_vector(struct irq_cfg *cfg)
@@ -568,12 +589,25 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
goto unlock;
/*
- * Check if the irq migration is in progress. If so, we
- * haven't received the cleanup request yet for this irq.
+ * Nothing to cleanup if irq migration is in progress
+ * or this cpu is not set in the cleanup mask.
*/
- if (data->move_in_progress)
+ if (data->move_in_progress ||
+ !cpumask_test_cpu(me, data->old_domain))
goto unlock;
+ /*
+ * We have two cases to handle here:
+ * 1) vector is unchanged but the target mask got reduced
+ * 2) vector and the target mask has changed
+ *
+ * #1 is obvious, but in #2 we have two vectors with the same
+ * irq descriptor: the old and the new vector. So we need to
+ * make sure that we only cleanup the old vector. The new
+ * vector has the current @vector number in the config and
+ * this cpu is part of the target mask. We better leave that
+ * one alone.
+ */
if (vector == data->cfg.vector &&
cpumask_test_cpu(me, data->domain))
goto unlock;
@@ -591,6 +625,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
goto unlock;
}
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ cpumask_clear_cpu(me, data->old_domain);
unlock:
raw_spin_unlock(&desc->lock);
}
@@ -619,12 +654,99 @@ void irq_complete_move(struct irq_cfg *cfg)
__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
}
-void irq_force_complete_move(int irq)
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+void irq_force_complete_move(struct irq_desc *desc)
{
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_data *irqdata = irq_desc_get_irq_data(desc);
+ struct apic_chip_data *data = apic_chip_data(irqdata);
+ struct irq_cfg *cfg = data ? &data->cfg : NULL;
+ unsigned int cpu;
+
+ if (!cfg)
+ return;
+
+ /*
+ * This is tricky. If the cleanup of @data->old_domain has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
+ */
+ raw_spin_lock(&vector_lock);
+ /*
+ * Clean out all offline cpus (including the outgoing one) from the
+ * old_domain mask.
+ */
+ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
- if (cfg)
- __irq_complete_move(cfg, cfg->vector);
+ /*
+ * If move_in_progress is cleared and the old_domain mask is empty,
+ * then there is nothing to cleanup. fixup_irqs() will take care of
+ * the stale vectors on the outgoing cpu.
+ */
+ if (!data->move_in_progress && cpumask_empty(data->old_domain)) {
+ raw_spin_unlock(&vector_lock);
+ return;
+ }
+
+ /*
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (data->move_in_progress) {
+ /*
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendevouz in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theroretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
+ */
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqdata->irq, cfg->old_vector);
+ }
+ /*
+ * If old_domain is not empty, then other cpus still have the irq
+ * descriptor set in their vector array. Clean it up.
+ */
+ for_each_cpu(cpu, data->old_domain)
+ per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
+
+ /* Cleanup the left overs of the (half finished) move */
+ cpumask_clear(data->old_domain);
+ data->move_in_progress = 0;
+ raw_spin_unlock(&vector_lock);
}
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
index 0a850100c594..2658e2af74ec 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -29,7 +29,7 @@ static char gen_pool_buf[MCE_POOLSZ];
void mce_gen_pool_process(void)
{
struct llist_node *head;
- struct mce_evt_llist *node;
+ struct mce_evt_llist *node, *tmp;
struct mce *mce;
head = llist_del_all(&mce_event_llist);
@@ -37,7 +37,7 @@ void mce_gen_pool_process(void)
return;
head = llist_reverse_order(head);
- llist_for_each_entry(node, head, llnode) {
+ llist_for_each_entry_safe(node, tmp, head, llnode) {
mce = &node->mce;
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 2c5aaf8c2e2f..05538582a809 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -385,6 +385,9 @@ static void intel_thermal_interrupt(void)
{
__u64 msr_val;
+ if (static_cpu_has(X86_FEATURE_HWP))
+ wrmsrl_safe(MSR_HWP_STATUS, 0);
+
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
/* Check for violation of core thermal thresholds*/
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index ce47402eb2f9..ac8975a65280 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -555,10 +555,14 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
cd.data = NULL;
cd.size = 0;
- cd = find_cpio_data(p, (void *)start, size, &offset);
- if (!cd.data) {
+ /* try built-in microcode if no initrd */
+ if (!size) {
if (!load_builtin_intel_microcode(&cd))
return UCODE_ERROR;
+ } else {
+ cd = find_cpio_data(p, (void *)start, size, &offset);
+ if (!cd.data)
+ return UCODE_ERROR;
}
return get_matching_model_microcode(0, start, cd.data, cd.size,
@@ -694,7 +698,7 @@ int __init save_microcode_in_initrd_intel(void)
if (count == 0)
return ret;
- copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
@@ -732,16 +736,20 @@ void __init load_ucode_intel_bsp(void)
struct boot_params *p;
p = (struct boot_params *)__pa_nodebug(&boot_params);
- start = p->hdr.ramdisk_image;
size = p->hdr.ramdisk_size;
- _load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- start, size);
+ /*
+ * Set start only if we have an initrd image. We cannot use initrd_start
+ * because it is not set that early yet.
+ */
+ start = (size ? p->hdr.ramdisk_image : 0);
+
+ _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ start, size);
#else
- start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
size = boot_params.hdr.ramdisk_size;
+ start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0);
_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
#endif
@@ -752,20 +760,14 @@ void load_ucode_intel_ap(void)
struct mc_saved_data *mc_saved_data_p;
struct ucode_cpu_info uci;
unsigned long *mc_saved_in_initrd_p;
- unsigned long initrd_start_addr;
enum ucode_state ret;
#ifdef CONFIG_X86_32
- unsigned long *initrd_start_p;
- mc_saved_in_initrd_p =
- (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
+ mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
- initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
#else
- mc_saved_data_p = &mc_saved_data;
mc_saved_in_initrd_p = mc_saved_in_initrd;
- initrd_start_addr = initrd_start;
+ mc_saved_data_p = &mc_saved_data;
#endif
/*
@@ -777,7 +779,7 @@ void load_ucode_intel_ap(void)
collect_cpu_info_early(&uci);
ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
+ get_initrd_start_addr(), &uci);
if (ret != UCODE_OK)
return;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 20e242ea1bc4..cfc4a966e2b9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -152,6 +152,11 @@ static struct clocksource hyperv_cs = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
+static unsigned char hv_get_nmi_reason(void)
+{
+ return 0;
+}
+
static void __init ms_hyperv_init_platform(void)
{
/*
@@ -191,6 +196,13 @@ static void __init ms_hyperv_init_platform(void)
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
mark_tsc_unstable("running on Hyper-V");
+
+ /*
+ * Generation 2 instances don't support reading the NMI status from
+ * 0x61 port.
+ */
+ if (efi_enabled(EFI_BOOT))
+ x86_platform.get_nmi_reason = hv_get_nmi_reason;
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 3b533cf37c74..b5624fafa44a 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -444,11 +444,24 @@ static void __init print_mtrr_state(void)
pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
+/* PAT setup for BP. We need to go through sync steps here */
+void __init mtrr_bp_pat_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ prepare_set();
+
+ pat_init();
+
+ post_set();
+ local_irq_restore(flags);
+}
+
/* Grab all of the MTRR state for this CPU into *state */
bool __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
- unsigned long flags;
unsigned lo, dummy;
unsigned int i;
@@ -481,15 +494,6 @@ bool __init get_mtrr_state(void)
mtrr_state_set = 1;
- /* PAT setup for BP. We need to go through sync steps here */
- local_irq_save(flags);
- prepare_set();
-
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index f891b4750f04..fa77ac8291f0 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -752,6 +752,9 @@ void __init mtrr_bp_init(void)
/* BIOS may override */
__mtrr_enabled = get_mtrr_state();
+ if (mtrr_enabled())
+ mtrr_bp_pat_init();
+
if (mtrr_cleanup(phys_addr)) {
changed_by_mtrr_cleanup = 1;
mtrr_if->set_all();
@@ -759,8 +762,16 @@ void __init mtrr_bp_init(void)
}
}
- if (!mtrr_enabled())
+ if (!mtrr_enabled()) {
pr_info("MTRR: Disabled\n");
+
+ /*
+ * PAT initialization relies on MTRR's rendezvous handler.
+ * Skip PAT init until the handler can initialize both
+ * features independently.
+ */
+ pat_disable("MTRRs disabled, skipping PAT initialization too.");
+ }
}
void mtrr_ap_init(void)
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 951884dcc433..6c7ced07d16d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -52,6 +52,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
bool get_mtrr_state(void);
+void mtrr_bp_pat_init(void);
extern void set_mtrr_ops(const struct mtrr_ops *ops);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bf79d7c97df..a3aeb2cc361e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -593,6 +593,19 @@ void x86_pmu_disable_all(void)
}
}
+/*
+ * There may be PMI landing after enabled=0. The PMI hitting could be before or
+ * after disable_all.
+ *
+ * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
+ * It will not be re-enabled in the NMI handler again, because enabled=0. After
+ * handling the NMI, disable_all will be called, which will not change the
+ * state either. If PMI hits after disable_all, the PMU is already disabled
+ * before entering NMI handler. The NMI handler will not change the state
+ * either.
+ *
+ * So either situation is harmless.
+ */
static void x86_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index d0e35ebb2adb..ee70445fbb1f 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -591,6 +591,7 @@ struct x86_pmu {
pebs_active :1,
pebs_broken :1;
int pebs_record_size;
+ int pebs_buffer_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
@@ -907,6 +908,8 @@ void intel_pmu_lbr_init_hsw(void);
void intel_pmu_lbr_init_skl(void);
+void intel_pmu_pebs_data_source_nhm(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index e2a430021e46..5f82cd59f0e5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1458,7 +1458,15 @@ static __initconst const u64 slm_hw_cache_event_ids
};
/*
- * Use from PMIs where the LBRs are already disabled.
+ * Used from PMIs where the LBRs are already disabled.
+ *
+ * This function could be called consecutively. It is required to remain in
+ * disabled state if called consecutively.
+ *
+ * During consecutive calls, the same disable value will be written to related
+ * registers, so the PMU state remains unchanged. hw.state in
+ * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive
+ * calls.
*/
static void __intel_pmu_disable_all(void)
{
@@ -1840,6 +1848,16 @@ again:
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
handled++;
x86_pmu.drain_pebs(regs);
+ /*
+ * There are cases where, even though, the PEBS ovfl bit is set
+ * in GLOBAL_OVF_STATUS, the PEBS events may also have their
+ * overflow bits set for their counters. We must clear them
+ * here because they have been processed as exact samples in
+ * the drain_pebs() routine. They must not be processed again
+ * in the for_each_bit_set() loop for regular samples below.
+ */
+ status &= ~cpuc->pebs_enabled;
+ status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
}
/*
@@ -1885,7 +1903,10 @@ again:
goto again;
done:
- __intel_pmu_enable_all(0, true);
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ if (cpuc->enabled)
+ __intel_pmu_enable_all(0, true);
+
/*
* Only unmask the NMI after the overflow counters
* have been reset. This avoids spurious NMIs on
@@ -3315,6 +3336,7 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+ intel_pmu_pebs_data_source_nhm();
x86_add_quirk(intel_nehalem_quirk);
pr_cont("Nehalem events, ");
@@ -3377,6 +3399,7 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+ intel_pmu_pebs_data_source_nhm();
pr_cont("Westmere events, ");
break;
@@ -3578,7 +3601,7 @@ __init int intel_pmu_init(void)
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
}
c->idxmsk64 &=
- ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+ ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
c->weight = hweight64(c->idxmsk64);
}
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index a316ca96f1b6..fc704ed587e8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -211,6 +211,20 @@ static void __put_rmid(u32 rmid)
list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}
+static void cqm_cleanup(void)
+{
+ int i;
+
+ if (!cqm_rmid_ptrs)
+ return;
+
+ for (i = 0; i < cqm_max_rmid; i++)
+ kfree(cqm_rmid_ptrs[i]);
+
+ kfree(cqm_rmid_ptrs);
+ cqm_rmid_ptrs = NULL;
+}
+
static int intel_cqm_setup_rmid_cache(void)
{
struct cqm_rmid_entry *entry;
@@ -218,7 +232,7 @@ static int intel_cqm_setup_rmid_cache(void)
int r = 0;
nr_rmids = cqm_max_rmid + 1;
- cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+ cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
nr_rmids, GFP_KERNEL);
if (!cqm_rmid_ptrs)
return -ENOMEM;
@@ -249,11 +263,9 @@ static int intel_cqm_setup_rmid_cache(void)
mutex_unlock(&cache_mutex);
return 0;
-fail:
- while (r--)
- kfree(cqm_rmid_ptrs[r]);
- kfree(cqm_rmid_ptrs);
+fail:
+ cqm_cleanup();
return -ENOMEM;
}
@@ -281,9 +293,13 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
/*
* Events that target same task are placed into the same cache group.
+ * Mark it as a multi event group, so that we update ->count
+ * for every event rather than just the group leader later.
*/
- if (a->hw.target == b->hw.target)
+ if (a->hw.target == b->hw.target) {
+ b->hw.is_group_event = true;
return true;
+ }
/*
* Are we an inherited event?
@@ -849,6 +865,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
bool conflict = false;
u32 rmid;
+ event->hw.is_group_event = false;
list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
rmid = iter->hw.cqm_rmid;
@@ -940,7 +957,9 @@ static u64 intel_cqm_event_count(struct perf_event *event)
return __perf_event_count(event);
/*
- * Only the group leader gets to report values. This stops us
+ * Only the group leader gets to report values except in case of
+ * multiple events in the same group, we still need to read the
+ * other events.This stops us
* reporting duplicate values to userspace, and gives us a clear
* rule for which task gets to report the values.
*
@@ -948,7 +967,7 @@ static u64 intel_cqm_event_count(struct perf_event *event)
* specific packages - we forfeit that ability when we create
* task events.
*/
- if (!cqm_group_leader(event))
+ if (!cqm_group_leader(event) && !event->hw.is_group_event)
return 0;
/*
@@ -1315,7 +1334,7 @@ static const struct x86_cpu_id intel_cqm_match[] = {
static int __init intel_cqm_init(void)
{
- char *str, scale[20];
+ char *str = NULL, scale[20];
int i, cpu, ret;
if (!x86_match_cpu(intel_cqm_match))
@@ -1375,16 +1394,25 @@ static int __init intel_cqm_init(void)
cqm_pick_event_reader(i);
}
- __perf_cpu_notifier(intel_cqm_cpu_notifier);
-
ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
- if (ret)
+ if (ret) {
pr_err("Intel CQM perf registration failed: %d\n", ret);
- else
- pr_info("Intel CQM monitoring enabled\n");
+ goto out;
+ }
+
+ pr_info("Intel CQM monitoring enabled\n");
+ /*
+ * Register the hot cpu notifier once we are sure cqm
+ * is enabled to avoid notifier leak.
+ */
+ __perf_cpu_notifier(intel_cqm_cpu_notifier);
out:
cpu_notifier_register_done();
+ if (ret) {
+ kfree(str);
+ cqm_cleanup();
+ }
return ret;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5db1c7755548..1e7de3cefc9c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -51,7 +51,8 @@ union intel_x86_pebs_dse {
#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
-static const u64 pebs_data_source[] = {
+/* Version for Sandy Bridge and later */
+static u64 pebs_data_source[] = {
P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
@@ -70,6 +71,14 @@ static const u64 pebs_data_source[] = {
OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
};
+/* Patch up minor differences in the bits */
+void __init intel_pmu_pebs_data_source_nhm(void)
+{
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
@@ -269,7 +278,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
- buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
+ buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -286,7 +295,7 @@ static int alloc_pebs_buffer(int cpu)
per_cpu(insn_buffer, cpu) = ibuffer;
}
- max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+ max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
ds->pebs_buffer_base = (u64)(unsigned long)buffer;
ds->pebs_index = ds->pebs_buffer_base;
@@ -1101,6 +1110,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
void *at;
u64 pebs_status;
+ /*
+ * fmt0 does not have a status bitfield (does not use
+ * perf_record_nhm format)
+ */
+ if (x86_pmu.intel_cap.pebs_format < 1)
+ return base;
+
if (base == NULL)
return NULL;
@@ -1186,7 +1202,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
if (!event->attr.precise_ip)
return;
- n = (top - at) / x86_pmu.pebs_record_size;
+ n = top - at;
if (n <= 0)
return;
@@ -1296,6 +1312,7 @@ void __init intel_ds_init(void)
x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+ x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
if (x86_pmu.pebs) {
char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
int format = x86_pmu.intel_cap.pebs_format;
@@ -1304,6 +1321,14 @@ void __init intel_ds_init(void)
case 0:
printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+ /*
+ * Using >PAGE_SIZE buffers makes the WRMSR to
+ * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
+ * mysteriously hang on Core2.
+ *
+ * As a workaround, we don't do this.
+ */
+ x86_pmu.pebs_buffer_size = PAGE_SIZE;
x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 868e1194337f..49e35d003b74 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -694,6 +694,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
/* clear STOP and INT from current entry */
buf->topa_index[buf->stop_pos]->stop = 0;
+ buf->topa_index[buf->stop_pos]->intr = 0;
buf->topa_index[buf->intr_pos]->intr = 0;
/* how many pages till the STOP marker */
@@ -718,6 +719,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
buf->intr_pos = idx;
buf->topa_index[buf->stop_pos]->stop = 1;
+ buf->topa_index[buf->stop_pos]->intr = 1;
buf->topa_index[buf->intr_pos]->intr = 1;
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
index 5b0c232d1ee6..b931095e86d4 100644
--- a/arch/x86/kernel/cpu/perf_event_knc.c
+++ b/arch/x86/kernel/cpu/perf_event_knc.c
@@ -263,7 +263,9 @@ again:
goto again;
done:
- knc_pmu_enable_all(0);
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ if (cpuc->enabled)
+ knc_pmu_enable_all(0);
return handled;
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index db9a675e751b..9fdf1d330727 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -11,7 +11,11 @@
#include <linux/pci.h>
#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
#include <linux/pci_ids.h>
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
#include <drm/i915_drm.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
@@ -21,6 +25,9 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/irq_remapping.h>
+#include <asm/early_ioremap.h>
+
+#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg)
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -76,6 +83,13 @@ static void __init nvidia_bugs(int num, int slot, int func)
#ifdef CONFIG_ACPI
#ifdef CONFIG_X86_IO_APIC
/*
+ * Only applies to Nvidia root ports (bus 0) and not to
+ * Nvidia graphics cards with PCI ports on secondary buses.
+ */
+ if (num)
+ return;
+
+ /*
* All timer overrides on Nvidia are
* wrong unless HPET is enabled.
* Unfortunately that's not true on many Asus boards.
@@ -589,6 +603,61 @@ static void __init force_disable_hpet(int num, int slot, int func)
#endif
}
+#define BCM4331_MMIO_SIZE 16384
+#define BCM4331_PM_CAP 0x40
+#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
+#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg)
+
+static void __init apple_airport_reset(int bus, int slot, int func)
+{
+ void __iomem *mmio;
+ u16 pmcsr;
+ u64 addr;
+ int i;
+
+ if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc."))
+ return;
+
+ /* Card may have been put into PCI_D3hot by grub quirk */
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
+ mdelay(10);
+
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ dev_err("Cannot power up Apple AirPort card\n");
+ return;
+ }
+ }
+
+ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
+ addr &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
+ if (!mmio) {
+ dev_err("Cannot iomap Apple AirPort card\n");
+ return;
+ }
+
+ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
+
+ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
+ udelay(10);
+
+ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(1);
+
+ bcma_awrite32(BCMA_RESET_CTL, 0);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(10);
+
+ early_iounmap(mmio, BCM4331_MMIO_SIZE);
+}
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
@@ -602,12 +671,6 @@ struct chipset {
void (*f)(int num, int slot, int func);
};
-/*
- * Only works for devices on the root bus. If you add any devices
- * not on bus 0 readd another loop level in early_quirks(). But
- * be careful because at least the Nvidia quirk here relies on
- * only matching on bus 0.
- */
static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
@@ -637,9 +700,13 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_BROADCOM, 0x4331,
+ PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
};
+static void __init early_pci_scan_bus(int bus);
+
/**
* check_dev_quirk - apply early quirks to a given PCI device
* @num: bus number
@@ -648,7 +715,7 @@ static struct chipset early_qrk[] __initdata = {
*
* Check the vendor & device ID against the early quirks table.
*
- * If the device is single function, let early_quirks() know so we don't
+ * If the device is single function, let early_pci_scan_bus() know so we don't
* poke at this device again.
*/
static int __init check_dev_quirk(int num, int slot, int func)
@@ -657,6 +724,7 @@ static int __init check_dev_quirk(int num, int slot, int func)
u16 vendor;
u16 device;
u8 type;
+ u8 sec;
int i;
class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
@@ -684,25 +752,36 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
+
+ if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+ sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
+ if (sec > num)
+ early_pci_scan_bus(sec);
+ }
+
if (!(type & 0x80))
return -1;
return 0;
}
-void __init early_quirks(void)
+static void __init early_pci_scan_bus(int bus)
{
int slot, func;
- if (!early_pci_allowed())
- return;
-
/* Poor man's PCI discovery */
- /* Only scan the root bus */
for (slot = 0; slot < 32; slot++)
for (func = 0; func < 8; func++) {
/* Only probe function 0 on single fn devices */
- if (check_dev_quirk(0, slot, func))
+ if (check_dev_quirk(bus, slot, func))
break;
}
}
+
+void __init early_quirks(void)
+{
+ if (!early_pci_allowed())
+ return;
+
+ early_pci_scan_bus(0);
+}
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 37dae792dbbe..589b3193f102 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -96,9 +96,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
SYSCALL_DEFINE1(iopl, unsigned int, level)
{
struct pt_regs *regs = current_pt_regs();
- unsigned int old = (regs->flags >> 12) & 3;
struct thread_struct *t = &current->thread;
+ /*
+ * Careful: the IOPL bits in regs->flags are undefined under Xen PV
+ * and changing them has no effect.
+ */
+ unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+
if (level > 3)
return -EINVAL;
/* Trying to gain more privileges? */
@@ -106,8 +111,9 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
- t->iopl = level << 12;
+ regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
+ (level << X86_EFLAGS_IOPL_BIT);
+ t->iopl = level << X86_EFLAGS_IOPL_BIT;
set_iopl_mask(t->iopl);
return 0;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f8062aaf5df9..61521dc19c10 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -462,7 +462,7 @@ void fixup_irqs(void)
* non intr-remapping case, we can't wait till this interrupt
* arrives at this cpu before completing the irq move.
*/
- irq_force_complete_move(irq);
+ irq_force_complete_move(desc);
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
break_affinity = 1;
@@ -470,6 +470,15 @@ void fixup_irqs(void)
}
chip = irq_data_get_irq_chip(data);
+ /*
+ * The interrupt descriptor might have been cleaned up
+ * already, but it is not yet removed from the radix tree
+ */
+ if (!chip) {
+ raw_spin_unlock(&desc->lock);
+ continue;
+ }
+
if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
chip->irq_mask(data);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 1deffe6cc873..023c442c33bb 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -959,7 +959,19 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* normal page fault.
*/
regs->ip = (unsigned long)cur->addr;
+ /*
+ * Trap flag (TF) has been set here because this fault
+ * happened where the single stepping will be done.
+ * So clear it by resetting the current kprobe:
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ /*
+ * If the TF flag was set before the kprobe hit,
+ * don't touch it:
+ */
regs->flags |= kcb->kprobe_old_flags;
+
if (kcb->kprobe_status == KPROBE_REENTER)
restore_previous_kprobe(kcb);
else
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e835d263a33b..4cbb60fbff3e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -48,6 +48,7 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
asmlinkage extern void ret_from_fork(void);
@@ -411,6 +412,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
+#ifdef CONFIG_XEN
+ /*
+ * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+ * current_pt_regs()->flags may not match the current task's
+ * intended IOPL. We need to switch it manually.
+ */
+ if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
+ prev->iopl != next->iopl))
+ xen_set_iopl_mask(next->iopl);
+#endif
+
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
* AMD CPUs have a misfeature: SYSRET sets the SS selector but
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d229a58..bf0ce75735b0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -66,6 +66,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
do {
version = __pvclock_read_cycles(src, &ret, &flags);
+ /* Make sure that the version double-check is last. */
+ smp_rmb();
} while ((src->version & 1) || version != src->version);
return flags & valid_flags;
@@ -80,6 +82,8 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
do {
version = __pvclock_read_cycles(src, &ret, &flags);
+ /* Make sure that the version double-check is last. */
+ smp_rmb();
} while ((src->version & 1) || version != src->version);
if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 02693dd9a079..f660d63f40fe 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
+ { /* Handle problems with rebooting on the iMac10,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac10,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+ },
+ },
/* ASRock */
{ /* Handle problems with rebooting on ASRock Q1900DC-ITX */
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
index b285d4e8c68e..5da924bbf0a0 100644
--- a/arch/x86/kernel/sysfb_efi.c
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -106,14 +106,24 @@ static int __init efifb_set_system(const struct dmi_system_id *id)
continue;
for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
resource_size_t start, end;
+ unsigned long flags;
+
+ flags = pci_resource_flags(dev, i);
+ if (!(flags & IORESOURCE_MEM))
+ continue;
+
+ if (flags & IORESOURCE_UNSET)
+ continue;
+
+ if (pci_resource_len(dev, i) == 0)
+ continue;
start = pci_resource_start(dev, i);
- if (start == 0)
- break;
end = pci_resource_end(dev, i);
if (screen_info.lfb_base >= start &&
screen_info.lfb_base < end) {
found_bar = 1;
+ break;
}
}
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ade185a46b1d..679302c312f8 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -109,6 +109,12 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec();
}
+/*
+ * In IST context, we explicitly disable preemption. This serves two
+ * purposes: it makes it much less likely that we would accidentally
+ * schedule in IST context and it will force a warning if we somehow
+ * manage to schedule by accident.
+ */
void ist_enter(struct pt_regs *regs)
{
if (user_mode(regs)) {
@@ -123,13 +129,7 @@ void ist_enter(struct pt_regs *regs)
rcu_nmi_enter();
}
- /*
- * We are atomic because we're on the IST stack; or we're on
- * x86_32, in which case we still shouldn't schedule; or we're
- * on x86_64 and entered from user mode, in which case we're
- * still atomic unless ist_begin_non_atomic is called.
- */
- preempt_count_add(HARDIRQ_OFFSET);
+ preempt_disable();
/* This code is a bit fragile. Test it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
@@ -137,7 +137,7 @@ void ist_enter(struct pt_regs *regs)
void ist_exit(struct pt_regs *regs)
{
- preempt_count_sub(HARDIRQ_OFFSET);
+ preempt_enable_no_resched();
if (!user_mode(regs))
rcu_nmi_exit();
@@ -168,7 +168,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
BUG_ON((unsigned long)(current_top_of_stack() -
current_stack_pointer()) >= THREAD_SIZE);
- preempt_count_sub(HARDIRQ_OFFSET);
+ preempt_enable_no_resched();
}
/**
@@ -178,7 +178,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
*/
void ist_end_non_atomic(void)
{
- preempt_count_add(HARDIRQ_OFFSET);
+ preempt_disable();
}
static nokprobe_inline int
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 92ae6acac8a7..6aa0f4d9eea6 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -92,7 +92,7 @@ unsigned long try_msr_calibrate_tsc(void)
if (freq_desc_tables[cpu_index].msr_plat) {
rdmsr(MSR_PLATFORM_INFO, lo, hi);
- ratio = (lo >> 8) & 0x1f;
+ ratio = (lo >> 8) & 0xff;
} else {
rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
ratio = (hi >> 8) & 0x1f;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index bf4db6eaec8f..c6aace2bbe08 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
*cursor &= 0xfe;
}
/*
- * Similar treatment for VEX3 prefix.
- * TODO: add XOP/EVEX treatment when insn decoder supports them
+ * Similar treatment for VEX3/EVEX prefix.
+ * TODO: add XOP treatment when insn decoder supports them
*/
- if (insn->vex_prefix.nbytes == 3) {
+ if (insn->vex_prefix.nbytes >= 3) {
/*
* vex2: c5 rvvvvLpp (has no b bit)
* vex3/xop: c4/8f rxbmmmmm wvvvvLpp
* evex: 62 rxbR00mm wvvvv1pp zllBVaaa
- * (evex will need setting of both b and x since
- * in non-sib encoding evex.x is 4th bit of MODRM.rm)
- * Setting VEX3.b (setting because it has inverted meaning):
+ * Setting VEX3.b (setting because it has inverted meaning).
+ * Setting EVEX.x since (in non-SIB encoding) EVEX.x
+ * is the 4th bit of MODRM.rm, and needs the same treatment.
+ * For VEX3-encoded insns, VEX3.x value has no effect in
+ * non-SIB encoding, the change is superfluous but harmless.
*/
cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
- *cursor |= 0x20;
+ *cursor |= 0x60;
}
/*
@@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
reg = MODRM_REG(insn); /* Fetch modrm.reg */
reg2 = 0xff; /* Fetch vex.vvvv */
- if (insn->vex_prefix.nbytes == 2)
- reg2 = insn->vex_prefix.bytes[1];
- else if (insn->vex_prefix.nbytes == 3)
+ if (insn->vex_prefix.nbytes)
reg2 = insn->vex_prefix.bytes[2];
/*
- * TODO: add XOP, EXEV vvvv reading.
+ * TODO: add XOP vvvv reading.
*
* vex.vvvv field is in bits 6-3, bits are inverted.
* But in 32-bit mode, high-order bit may be ignored.