Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 27
-rw-r--r--  arch/x86/kernel/alternative.c | 15
-rw-r--r--  arch/x86/kernel/apic/apic.c | 7
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 14
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 5
-rw-r--r--  arch/x86/kernel/apm_32.c | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 251
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 1087
-rw-r--r--  arch/x86/kernel/cpu/common.c | 207
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 2
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 26
-rw-r--r--  arch/x86/kernel/cpu/match.c | 44
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 31
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 71
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 12
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 8
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r--  arch/x86/kernel/cpu/topology.c | 5
-rw-r--r--  arch/x86/kernel/cpu/tsx.c | 33
-rw-r--r--  arch/x86/kernel/crash.c | 17
-rw-r--r--  arch/x86/kernel/dumpstack.c | 11
-rw-r--r--  arch/x86/kernel/early-quirks.c | 10
-rw-r--r--  arch/x86/kernel/fpu/init.c | 8
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 8
-rw-r--r--  arch/x86/kernel/head64.c | 2
-rw-r--r--  arch/x86/kernel/i8259.c | 39
-rw-r--r--  arch/x86/kernel/irqinit.c | 4
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 6
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/pmem.c | 7
-rw-r--r--  arch/x86/kernel/process.c | 2
-rw-r--r--  arch/x86/kernel/quirks.c | 10
-rw-r--r--  arch/x86/kernel/reboot.c | 88
-rw-r--r--  arch/x86/kernel/smp.c | 6
-rw-r--r--  arch/x86/kernel/smpboot.c | 25
-rw-r--r--  arch/x86/kernel/step.c | 3
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 7
-rw-r--r--  arch/x86/kernel/sysfb_efi.c | 16
-rw-r--r--  arch/x86/kernel/unwind_orc.c | 17
-rw-r--r--  arch/x86/kernel/uprobes.c | 4
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 2
43 files changed, 1644 insertions(+), 506 deletions(-)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b35c34cfbe52..2f0fa294d617 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -156,6 +156,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
madt->address);
}
+ if (madt->flags & ACPI_MADT_PCAT_COMPAT)
+ legacy_pic_pcat_compat();
+
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
@@ -1351,6 +1354,17 @@ static int __init disable_acpi_pci(const struct dmi_system_id *d)
return 0;
}
+static int __init disable_acpi_xsdt(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ pr_notice("%s detected: force use of acpi=rsdt\n", d->ident);
+ acpi_gbl_do_not_use_xsdt = TRUE;
+ } else {
+ pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n");
+ }
+ return 0;
+}
+
static int __init dmi_disable_acpi(const struct dmi_system_id *d)
{
if (!acpi_force) {
@@ -1475,6 +1489,19 @@ static const struct dmi_system_id acpi_dmi_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ /*
+ * Boxes that need ACPI XSDT use disabled due to corrupted tables
+ */
+ {
+ .callback = disable_acpi_xsdt,
+ .ident = "Advantech DAC-BJ01",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"),
+ DMI_MATCH(DMI_BIOS_VERSION, "V1.12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"),
+ },
+ },
{}
};
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 918a23704c0c..33882d5ab25d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -366,6 +366,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("alt table %px, -> %px", start, end);
+
+ /*
+ * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
+ * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
+ * During the process, KASAN becomes confused seeing partial LA57
+ * conversion and triggers a false-positive out-of-bound report.
+ *
+ * Disable KASAN until the patching is complete.
+ */
+ kasan_disable_current();
+
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
@@ -426,6 +437,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
text_poke_early(instr, insnbuf, insnbuf_sz);
}
+
+ kasan_enable_current();
}
#ifdef CONFIG_SMP
@@ -677,8 +690,8 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
} else {
local_irq_save(flags);
memcpy(addr, opcode, len);
- local_irq_restore(flags);
sync_core();
+ local_irq_restore(flags);
/*
* Could also do a CLFLUSH here to speed up CPU recovery; but
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 9791828f3fcd..9318fe7d850e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -166,7 +166,7 @@ static __init int setup_apicpmtimer(char *s)
{
apic_calibrate_pmtmr = 1;
notsc_setup(NULL);
- return 0;
+ return 1;
}
__setup("apicpmtimer", setup_apicpmtimer);
#endif
@@ -403,10 +403,9 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
if (vector && !eilvt_entry_is_changeable(vector, new))
/* may not change if vectors are different */
return rsvd;
- rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
- } while (rsvd != new);
+ } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new));
- rsvd &= ~APIC_EILVT_MASKED;
+ rsvd = new & ~APIC_EILVT_MASKED;
if (rsvd && rsvd != vector)
pr_info("LVT offset %d assigned for vector 0x%02x\n",
offset, rsvd);
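
[Editor's sketch] The hunk above replaces an open-coded atomic_cmpxchg() loop with atomic_try_cmpxchg(), which refreshes the expected value on failure so the loop no longer needs a separate re-read. A minimal userspace sketch of the same pattern using C11 <stdatomic.h> rather than the kernel's atomic_t API; reserve_slot() and the values are made up for illustration:

/*
 * Userspace sketch (not kernel code) of the try_cmpxchg loop pattern:
 * atomic_compare_exchange_strong() updates "expected" with the current
 * value when it fails, so the loop needs no explicit re-read.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int slot;

static unsigned int reserve_slot(unsigned int new)
{
	unsigned int expected = atomic_load(&slot);

	do {
		if (expected)		/* already claimed by someone else */
			return expected;
		/* on failure, "expected" now holds the current value */
	} while (!atomic_compare_exchange_strong(&slot, &expected, new));

	return new;
}

int main(void)
{
	printf("first reservation: %#x\n", reserve_slot(0x42));
	printf("second attempt:    %#x\n", reserve_slot(0x7f));
	return 0;
}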
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7fc76edbf3ba..aea77278e9b4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2449,17 +2449,21 @@ static int io_apic_get_redir_entries(int ioapic)
unsigned int arch_dynirq_lower_bound(unsigned int from)
{
+ unsigned int ret;
+
/*
* dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
* gsi_top if ioapic_dynirq_base hasn't been initialized yet.
*/
- if (!ioapic_initialized)
- return gsi_top;
+ ret = ioapic_dynirq_base ? : gsi_top;
+
/*
- * For DT enabled machines ioapic_dynirq_base is irrelevant and not
- * updated. So simply return @from if ioapic_dynirq_base == 0.
+ * For DT enabled machines ioapic_dynirq_base is irrelevant and
+ * always 0. gsi_top can be 0 if there is no IO/APIC registered.
+ * 0 is an invalid interrupt number for dynamic allocations. Return
+ * @from instead.
*/
- return ioapic_dynirq_base ? : from;
+ return ret ? : from;
}
#ifdef CONFIG_X86_32
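
[Editor's sketch] The hunk above relies on the GNU "a ?: b" conditional (a GCC/Clang extension), which evaluates to a if a is non-zero and to b otherwise, evaluating a only once. A tiny userspace illustration with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned int dynirq_base = 0, gsi_top = 24, from = 16;
	unsigned int ret = dynirq_base ? : gsi_top;	/* 24: base is unset */

	printf("lower bound = %u\n", ret ? : from);	/* 24: ret is non-zero */
	return 0;
}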
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8e70c2ba21b3..fb17767552ef 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -102,7 +102,10 @@ static void init_x2apic_ldr(void)
static int x2apic_phys_probe(void)
{
- if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
+ if (!x2apic_mode)
+ return 0;
+
+ if (x2apic_phys || x2apic_fadt_phys())
return 1;
return apic == &apic_x2apic_phys;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f7151cd03cb0..3d7a8049f637 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -247,12 +247,6 @@ extern int (*console_blank_hook)(int);
#endif
/*
- * The apm_bios device is one of the misc char devices.
- * This is its minor number.
- */
-#define APM_MINOR_DEV 134
-
-/*
* Various options can be changed at boot time as follows:
* (We allow underscores for compatibility with the modules code)
* apm=on/off enable/disable APM
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 98c23126f751..84667781c41d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -23,11 +23,6 @@
#include "cpu.h"
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static const int amd_erratum_1054[];
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
-
/*
* nodes_per_socket: Stores the number of nodes per socket.
* Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
@@ -35,6 +30,83 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
*/
static u32 nodes_per_socket = 1;
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE().
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+ ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+
+static const int amd_erratum_400[] =
+ AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+ AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+
+static const int amd_erratum_383[] =
+ AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+
+/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
+static const int amd_erratum_1054[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
+
+static const int amd_zenbleed[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf),
+ AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
+ AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf),
+ AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));
+
+static const int amd_erratum_1485[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf),
+ AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf));
+
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
+{
+ int osvw_id = *erratum++;
+ u32 range;
+ u32 ms;
+
+ if (osvw_id >= 0 && osvw_id < 65536 &&
+ cpu_has(cpu, X86_FEATURE_OSVW)) {
+ u64 osvw_len;
+
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+ if (osvw_id < osvw_len) {
+ u64 osvw_bits;
+
+ rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+ osvw_bits);
+ return osvw_bits & (1ULL << (osvw_id & 0x3f));
+ }
+ }
+
+ /* OSVW unavailable or ID unknown, match family-model-stepping range */
+ ms = (cpu->x86_model << 4) | cpu->x86_stepping;
+ while ((range = *erratum++))
+ if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+ (ms >= AMD_MODEL_RANGE_START(range)) &&
+ (ms <= AMD_MODEL_RANGE_END(range)))
+ return true;
+
+ return false;
+}
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
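
[Editor's sketch] The errata table above packs each family/model/stepping window into a single int via AMD_MODEL_RANGE(). A small standalone C program showing how the encoding and the (model << 4) | stepping comparison work; the macros are copied from the hunk above, while in_range() and the CPU values are illustrative only:

/*
 * AMD_MODEL_RANGE(): family in bits 31-24, start model/stepping in bits
 * 23-12, end model/stepping in bits 11-0.
 */
#include <stdbool.h>
#include <stdio.h>

#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
	((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
#define AMD_MODEL_RANGE_FAMILY(range)	(((range) >> 24) & 0xff)
#define AMD_MODEL_RANGE_START(range)	(((range) >> 12) & 0xfff)
#define AMD_MODEL_RANGE_END(range)	((range) & 0xfff)

static bool in_range(unsigned int family, unsigned int model,
		     unsigned int stepping, unsigned int range)
{
	unsigned int ms = (model << 4) | stepping;

	return family == AMD_MODEL_RANGE_FAMILY(range) &&
	       ms >= AMD_MODEL_RANGE_START(range) &&
	       ms <= AMD_MODEL_RANGE_END(range);
}

int main(void)
{
	/* Zen2 range from amd_zenbleed[]: family 0x17, models 0x30-0x4f */
	unsigned int range = AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf);

	printf("range encodes to 0x%08x\n", range);		/* 0x173004ff */
	printf("17h/31h/0 matches: %d\n", in_range(0x17, 0x31, 0x0, range));
	printf("19h/31h/0 matches: %d\n", in_range(0x19, 0x31, 0x0, range));
	return 0;
}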
@@ -199,6 +271,15 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
return;
}
#endif
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ */
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
static void init_amd_k7(struct cpuinfo_x86 *c)
@@ -789,8 +870,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
-#define MSR_AMD64_DE_CFG 0xC0011029
-
static void init_amd_ln(struct cpuinfo_x86 *c)
{
/*
@@ -885,12 +964,62 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_ZEN);
- /*
- * Fix erratum 1076: CPB feature bit not being set in CPUID.
- * Always set it, except when running under a hypervisor.
- */
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB))
- set_cpu_cap(c, X86_FEATURE_CPB);
+ /* Fix up CPUID bits, but only if not virtualised. */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+
+ /* Erratum 1076: CPB feature bit not being set in CPUID. */
+ if (!cpu_has(c, X86_FEATURE_CPB))
+ set_cpu_cap(c, X86_FEATURE_CPB);
+
+ /*
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
+ */
+ if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
+}
+
+static bool cpu_has_zenbleed_microcode(void)
+{
+ u32 good_rev = 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case 0x30 ... 0x3f: good_rev = 0x0830107a; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010b; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608105; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701032; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00008; break;
+
+ default:
+ return false;
+ break;
+ }
+
+ if (boot_cpu_data.microcode < good_rev)
+ return false;
+
+ return true;
+}
+
+static void zenbleed_check(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has_amd_erratum(c, amd_zenbleed))
+ return;
+
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (!cpu_has(c, X86_FEATURE_AVX))
+ return;
+
+ if (!cpu_has_zenbleed_microcode()) {
+ pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n");
+ msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ } else {
+ msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ }
}
static void init_amd(struct cpuinfo_x86 *c)
@@ -951,16 +1080,16 @@ static void init_amd(struct cpuinfo_x86 *c)
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
*/
- msr_set_bit(MSR_F10H_DECFG,
- MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
/*
* Verify that the MSR write was successful (could be running
* under a hypervisor) and only then assume that LFENCE is
* serializing.
*/
- ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
- if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
+ ret = rdmsrl_safe(MSR_AMD64_DE_CFG, &val);
+ if (!ret && (val & MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT)) {
/* A serializing LFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
} else {
@@ -995,6 +1124,12 @@ static void init_amd(struct cpuinfo_x86 *c)
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
+
+ zenbleed_check(c);
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ cpu_has_amd_erratum(c, amd_erratum_1485))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
}
#ifdef CONFIG_X86_32
@@ -1090,73 +1225,6 @@ static const struct cpu_dev amd_cpu_dev = {
cpu_dev_register(amd_cpu_dev);
-/*
- * AMD errata checking
- *
- * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
- * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
- * have an OSVW id assigned, which it takes as first argument. Both take a
- * variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE().
- *
- * Example:
- *
- * const int amd_erratum_319[] =
- * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
- * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
- * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
- */
-
-#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
-#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
-#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
- ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
-#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
-#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
-#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
-
-static const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
- AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
-
-static const int amd_erratum_383[] =
- AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
-
-/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
-static const int amd_erratum_1054[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
-
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
-{
- int osvw_id = *erratum++;
- u32 range;
- u32 ms;
-
- if (osvw_id >= 0 && osvw_id < 65536 &&
- cpu_has(cpu, X86_FEATURE_OSVW)) {
- u64 osvw_len;
-
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
- if (osvw_id < osvw_len) {
- u64 osvw_bits;
-
- rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
- osvw_bits);
- return osvw_bits & (1ULL << (osvw_id & 0x3f));
- }
- }
-
- /* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_stepping;
- while ((range = *erratum++))
- if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
- (ms >= AMD_MODEL_RANGE_START(range)) &&
- (ms <= AMD_MODEL_RANGE_END(range)))
- return true;
-
- return false;
-}
-
void set_dr_addr_mask(unsigned long mask, int dr)
{
if (!boot_cpu_has(X86_FEATURE_BPEXT))
@@ -1175,3 +1243,18 @@ void set_dr_addr_mask(unsigned long mask, int dr)
break;
}
}
+
+static void zenbleed_check_cpu(void *unused)
+{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ zenbleed_check(c);
+}
+
+void amd_check_microcode(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
+}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c524fa1f4c0e..6e1acbdd27a5 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -9,7 +9,6 @@
* - Andrew D. Balsa (code cleanup).
*/
#include <linux/init.h>
-#include <linux/utsname.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/nospec.h>
@@ -25,34 +24,69 @@
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/alternative.h>
#include <asm/pgtable.h>
-#include <asm/set_memory.h>
#include <asm/intel-family.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
+#include <linux/bpf.h>
#include "cpu.h"
static void __init spectre_v1_select_mitigation(void);
static void __init spectre_v2_select_mitigation(void);
+static void __init retbleed_select_mitigation(void);
+static void __init spectre_v2_user_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
-static void __init mds_print_mitigation(void);
+static void __init md_clear_update_mitigation(void);
+static void __init md_clear_select_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init mmio_select_mitigation(void);
static void __init srbds_select_mitigation(void);
+static void __init gds_select_mitigation(void);
-/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+
+/* The current value of the SPEC_CTRL MSR with task-specific bits set */
+DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
+
static DEFINE_MUTEX(spec_ctrl_mutex);
+/* Update SPEC_CTRL MSR and its cached copy unconditionally */
+static void update_spec_ctrl(u64 val)
+{
+ this_cpu_write(x86_spec_ctrl_current, val);
+ wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
/*
- * The vendor and possibly platform specific bits which can be modified in
- * x86_spec_ctrl_base.
+ * Keep track of the SPEC_CTRL MSR value for the current task, which may differ
+ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
*/
-static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
+void update_spec_ctrl_cond(u64 val)
+{
+ if (this_cpu_read(x86_spec_ctrl_current) == val)
+ return;
+
+ this_cpu_write(x86_spec_ctrl_current, val);
+
+ /*
+ * When KERNEL_IBRS this MSR is written on return-to-user, unless
+ * forced the update can be delayed until that time.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
+u64 spec_ctrl_current(void)
+{
+ return this_cpu_read(x86_spec_ctrl_current);
+}
+EXPORT_SYMBOL_GPL(spec_ctrl_current);
/*
* AMD specific MSR info for Speculative Store Bypass control.
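
[Editor's sketch] update_spec_ctrl_cond() in the hunk above keeps a per-CPU cached copy of the last SPEC_CTRL value and skips the MSR write when nothing changed (and, with KERNEL_IBRS, defers it to return-to-user, which the sketch below omits). A minimal userspace sketch of the write-avoidance pattern; update_spec_ctrl_cond_sketch() and wrmsr_mock() are stand-ins, not real interfaces:

#include <stdint.h>
#include <stdio.h>

static _Thread_local uint64_t spec_ctrl_cached;	/* per-thread, standing in for per-CPU */

static void wrmsr_mock(uint64_t val)
{
	printf("WRMSR SPEC_CTRL <- 0x%llx\n", (unsigned long long)val);
}

static void update_spec_ctrl_cond_sketch(uint64_t val)
{
	if (spec_ctrl_cached == val)	/* nothing changed, avoid the MSR write */
		return;

	spec_ctrl_cached = val;
	wrmsr_mock(val);
}

int main(void)
{
	update_spec_ctrl_cond_sketch(0x1);	/* writes */
	update_spec_ctrl_cond_sketch(0x1);	/* skipped: cached value matches */
	update_spec_ctrl_cond_sketch(0x5);	/* writes */
	return 0;
}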
@@ -75,107 +109,61 @@ EXPORT_SYMBOL_GPL(mds_user_clear);
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
-void __init check_bugs(void)
-{
- identify_boot_cpu();
-
- /*
- * identify_boot_cpu() initialized SMT support information, let the
- * core code know.
- */
- cpu_smt_check_topology();
-
- if (!IS_ENABLED(CONFIG_SMP)) {
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
- }
+/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */
+DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
+void __init cpu_select_mitigations(void)
+{
/*
* Read the SPEC_CTRL MSR to account for reserved bits which may
* have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
* init code as it is not enumerated and depends on the family.
*/
- if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
- /* Allow STIBP in MSR_SPEC_CTRL if supported */
- if (boot_cpu_has(X86_FEATURE_STIBP))
- x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
+ /*
+ * Previously running kernel (kexec), may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
- ssb_select_mitigation();
- l1tf_select_mitigation();
- mds_select_mitigation();
- taa_select_mitigation();
- srbds_select_mitigation();
-
- /*
- * As MDS and TAA mitigations are inter-related, print MDS
- * mitigation until after TAA mitigation selection is done.
- */
- mds_print_mitigation();
-
- arch_smt_update();
-
-#ifdef CONFIG_X86_32
/*
- * Check whether we are able to run this kernel safely on SMP.
- *
- * - i386 is no longer supported.
- * - In order to run on anything without a TSC, we need to be
- * compiled for a i486.
+ * retbleed_select_mitigation() relies on the state set by
+ * spectre_v2_select_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
*/
- if (boot_cpu_data.x86 < 4)
- panic("Kernel requires i486+ for 'invlpg' and other features");
-
- init_utsname()->machine[1] =
- '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
- alternative_instructions();
-
- fpu__init_check_bugs();
-#else /* CONFIG_X86_64 */
- alternative_instructions();
-
+ retbleed_select_mitigation();
/*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
+ * spectre_v2_user_select_mitigation() relies on the state set by
+ * retbleed_select_mitigation(); specifically the STIBP selection is
+ * forced for UNRET.
*/
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-#endif
+ spectre_v2_user_select_mitigation();
+ ssb_select_mitigation();
+ l1tf_select_mitigation();
+ md_clear_select_mitigation();
+ srbds_select_mitigation();
+ gds_select_mitigation();
}
+/*
+ * NOTE: For VMX, this function is not called in the vmexit path.
+ * It uses vmx_spec_ctrl_restore_host() instead.
+ */
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval, hostval = x86_spec_ctrl_base;
+ u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
struct thread_info *ti = current_thread_info();
- /* Is MSR_SPEC_CTRL implemented ? */
if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- /*
- * Restrict guest_spec_ctrl to supported values. Clear the
- * modifiable bits in the host base value and or the
- * modifiable bits from the guest value.
- */
- guestval = hostval & ~x86_spec_ctrl_mask;
- guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
-
- /* SSBD controlled in MSR_SPEC_CTRL */
- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
- static_cpu_has(X86_FEATURE_AMD_SSBD))
- hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
-
- /* Conditional STIBP enabled? */
- if (static_branch_unlikely(&switch_to_cond_stibp))
- hostval |= stibp_tif_to_spec_ctrl(ti->flags);
-
if (hostval != guestval) {
msrval = setguest ? guestval : hostval;
wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
@@ -256,14 +244,6 @@ static void __init mds_select_mitigation(void)
}
}
-static void __init mds_print_mitigation(void)
-{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
- return;
-
- pr_info("%s\n", mds_strings[mds_mitigation]);
-}
-
static int __init mds_cmdline(char *str)
{
if (!boot_cpu_has_bug(X86_BUG_MDS))
@@ -311,7 +291,7 @@ static void __init taa_select_mitigation(void)
/* TSX previously disabled by tsx=off */
if (!boot_cpu_has(X86_FEATURE_RTM)) {
taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
- goto out;
+ return;
}
if (cpu_mitigations_off()) {
@@ -325,7 +305,7 @@ static void __init taa_select_mitigation(void)
*/
if (taa_mitigation == TAA_MITIGATION_OFF &&
mds_mitigation == MDS_MITIGATION_OFF)
- goto out;
+ return;
if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
taa_mitigation = TAA_MITIGATION_VERW;
@@ -357,18 +337,6 @@ static void __init taa_select_mitigation(void)
if (taa_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
-
- /*
- * Update MDS mitigation, if necessary, as the mds_user_clear is
- * now enabled for TAA mitigation.
- */
- if (mds_mitigation == MDS_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MDS)) {
- mds_mitigation = MDS_MITIGATION_FULL;
- mds_select_mitigation();
- }
-out:
- pr_info("%s\n", taa_strings[taa_mitigation]);
}
static int __init tsx_async_abort_parse_cmdline(char *str)
@@ -393,6 +361,154 @@ static int __init tsx_async_abort_parse_cmdline(char *str)
early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "MMIO Stale Data: " fmt
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
+static bool mmio_nosmt __ro_after_init = false;
+
+static const char * const mmio_strings[] = {
+ [MMIO_MITIGATION_OFF] = "Vulnerable",
+ [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+};
+
+static void __init mmio_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
+ boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) ||
+ cpu_mitigations_off()) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ return;
+ }
+
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * Enable CPU buffer clear mitigation for host and VMM, if also affected
+ * by MDS or TAA. Otherwise, enable mitigation for VMM only.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
+ boot_cpu_has(X86_FEATURE_RTM)))
+ static_branch_enable(&mds_user_clear);
+ else
+ static_branch_enable(&mmio_stale_data_clear);
+
+ /*
+ * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
+ * be propagated to uncore buffers, clearing the Fill buffers on idle
+ * is required irrespective of SMT state.
+ */
+ if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
+ static_branch_enable(&mds_idle_clear);
+
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if ((ia32_cap & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(ia32_cap & ARCH_CAP_MDS_NO)))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+
+ if (mmio_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+}
+
+static int __init mmio_stale_data_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static void __init md_clear_update_mitigation(void)
+{
+ if (cpu_mitigations_off())
+ return;
+
+ if (!static_key_enabled(&mds_user_clear))
+ goto out;
+
+ /*
+ * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data
+ * mitigation, if necessary.
+ */
+ if (mds_mitigation == MDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_select_mitigation();
+ }
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_select_mitigation();
+ }
+ if (mmio_mitigation == MMIO_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_select_mitigation();
+ }
+out:
+ if (boot_cpu_has_bug(X86_BUG_MDS))
+ pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ pr_info("TAA: %s\n", taa_strings[taa_mitigation]);
+ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
+ else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
+ pr_info("MMIO Stale Data: Unknown: No mitigations\n");
+}
+
+static void __init md_clear_select_mitigation(void)
+{
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+
+ /*
+ * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
+ * and print their mitigation after MDS, TAA and MMIO Stale Data
+ * mitigation selection is done.
+ */
+ md_clear_update_mitigation();
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) "SRBDS: " fmt
enum srbds_mitigations {
@@ -453,11 +569,13 @@ static void __init srbds_select_mitigation(void)
return;
/*
- * Check to see if this is one of the MDS_NO systems supporting
- * TSX that are only exposed to SRBDS when TSX is enabled.
+ * Check to see if this is one of the MDS_NO systems supporting TSX that
+ * are only exposed to SRBDS when TSX is enabled or when CPU is affected
+ * by Processor MMIO Stale Data vulnerability.
*/
ia32_cap = x86_read_arch_cap_msr();
- if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM))
+ if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
@@ -484,6 +602,149 @@ static int __init srbds_parse_cmdline(char *str)
early_param("srbds", srbds_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "GDS: " fmt
+
+enum gds_mitigations {
+ GDS_MITIGATION_OFF,
+ GDS_MITIGATION_UCODE_NEEDED,
+ GDS_MITIGATION_FORCE,
+ GDS_MITIGATION_FULL,
+ GDS_MITIGATION_FULL_LOCKED,
+ GDS_MITIGATION_HYPERVISOR,
+};
+
+#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION)
+static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
+#else
+static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
+#endif
+
+static const char * const gds_strings[] = {
+ [GDS_MITIGATION_OFF] = "Vulnerable",
+ [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode",
+ [GDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)",
+ [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+bool gds_ucode_mitigated(void)
+{
+ return (gds_mitigation == GDS_MITIGATION_FULL ||
+ gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
+}
+EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+
+void update_gds_msr(void)
+{
+ u64 mcu_ctrl_after;
+ u64 mcu_ctrl;
+
+ switch (gds_mitigation) {
+ case GDS_MITIGATION_OFF:
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl |= GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FULL_LOCKED:
+ /*
+ * The LOCKED state comes from the boot CPU. APs might not have
+ * the same state. Make sure the mitigation is enabled on all
+ * CPUs.
+ */
+ case GDS_MITIGATION_FULL:
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl &= ~GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FORCE:
+ case GDS_MITIGATION_UCODE_NEEDED:
+ case GDS_MITIGATION_HYPERVISOR:
+ return;
+ };
+
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ /*
+ * Check to make sure that the WRMSR value was not ignored. Writes to
+ * GDS_MITG_DIS will be ignored if this processor is locked but the boot
+ * processor was not.
+ */
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+ WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
+}
+
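
[Editor's sketch] update_gds_msr() above writes the control bit and then reads the MSR back, warning if the write was silently ignored (the locked state can differ between the boot CPU and an AP). A userspace sketch of that write-then-verify pattern; msr_read(), msr_write() and set_mitigation_bit() are made-up stand-ins, with a "locked" flag simulating the hardware dropping the write:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t fake_msr;
static bool locked = true;	/* simulate a locked mitigation control */

static uint64_t msr_read(void)          { return fake_msr; }
static void     msr_write(uint64_t val) { if (!locked) fake_msr = val; }

static void set_mitigation_bit(uint64_t bit)
{
	uint64_t want = msr_read() | bit;

	msr_write(want);
	if (msr_read() != want)		/* write was silently dropped */
		fprintf(stderr, "WARN: MSR write ignored (locked?)\n");
}

int main(void)
{
	set_mitigation_bit(1ULL << 4);	/* warns: the control is locked */
	locked = false;
	set_mitigation_bit(1ULL << 4);	/* succeeds silently */
	return 0;
}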
+static void __init gds_select_mitigation(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ gds_mitigation = GDS_MITIGATION_HYPERVISOR;
+ goto out;
+ }
+
+ if (cpu_mitigations_off())
+ gds_mitigation = GDS_MITIGATION_OFF;
+ /* Will verify below that mitigation _can_ be disabled */
+
+ /* No microcode */
+ if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) {
+ if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+ * This only needs to be done on the boot CPU so do it
+ * here rather than in update_gds_msr()
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ } else {
+ gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
+ }
+ goto out;
+ }
+
+ /* Microcode has mitigation, use it */
+ if (gds_mitigation == GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ if (mcu_ctrl & GDS_MITG_LOCKED) {
+ if (gds_mitigation == GDS_MITIGATION_OFF)
+ pr_warn("Mitigation locked. Disable failed.\n");
+
+ /*
+ * The mitigation is selected from the boot CPU. All other CPUs
+ * _should_ have the same state. If the boot CPU isn't locked
+ * but others are then update_gds_msr() will WARN() of the state
+ * mismatch. If the boot CPU is locked update_gds_msr() will
+ * ensure the other CPUs have the mitigation enabled.
+ */
+ gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
+ }
+
+ update_gds_msr();
+out:
+ pr_info("%s\n", gds_strings[gds_mitigation]);
+}
+
+static int __init gds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ gds_mitigation = GDS_MITIGATION_OFF;
+ else if (!strcmp(str, "force"))
+ gds_mitigation = GDS_MITIGATION_FORCE;
+
+ return 0;
+}
+early_param("gather_data_sampling", gds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -575,12 +836,103 @@ static int __init nospectre_v1_cmdline(char *str)
}
early_param("nospectre_v1", nospectre_v1_cmdline);
-#undef pr_fmt
-#define pr_fmt(fmt) "Spectre V2 : " fmt
-
static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
SPECTRE_V2_NONE;
+#undef pr_fmt
+#define pr_fmt(fmt) "RETBleed: " fmt
+
+enum retbleed_mitigation {
+ RETBLEED_MITIGATION_NONE,
+ RETBLEED_MITIGATION_IBRS,
+ RETBLEED_MITIGATION_EIBRS,
+};
+
+enum retbleed_mitigation_cmd {
+ RETBLEED_CMD_OFF,
+ RETBLEED_CMD_AUTO,
+};
+
+const char * const retbleed_strings[] = {
+ [RETBLEED_MITIGATION_NONE] = "Vulnerable",
+ [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
+ [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
+};
+
+static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
+ RETBLEED_MITIGATION_NONE;
+static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
+ RETBLEED_CMD_AUTO;
+
+static int __init retbleed_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ retbleed_cmd = RETBLEED_CMD_OFF;
+ else if (!strcmp(str, "auto"))
+ retbleed_cmd = RETBLEED_CMD_AUTO;
+ else
+ pr_err("Unknown retbleed option (%s). Defaulting to 'auto'\n", str);
+
+ return 0;
+}
+early_param("retbleed", retbleed_parse_cmdline);
+
+#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n"
+#define RETBLEED_COMPILER_MSG "WARNING: kernel not compiled with RETPOLINE or -mfunction-return capable compiler!\n"
+#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n"
+
+static void __init retbleed_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
+ return;
+
+ switch (retbleed_cmd) {
+ case RETBLEED_CMD_OFF:
+ return;
+
+ case RETBLEED_CMD_AUTO:
+ default:
+ /*
+ * The Intel mitigation (IBRS) was already selected in
+ * spectre_v2_select_mitigation().
+ */
+
+ break;
+ }
+
+ switch (retbleed_mitigation) {
+ default:
+ break;
+ }
+
+ /*
+ * Let IBRS trump all on Intel without affecting the effects of the
+ * retbleed= cmdline option.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_IBRS:
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ break;
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ break;
+ default:
+ pr_err(RETBLEED_INTEL_MSG);
+ }
+ }
+
+ pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt
+
static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
SPECTRE_V2_USER_NONE;
static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
@@ -607,6 +959,33 @@ static inline const char *spectre_v2_module_string(void)
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
+#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n"
+
+#ifdef CONFIG_BPF_SYSCALL
+void unpriv_ebpf_notify(int new_state)
+{
+ if (new_state)
+ return;
+
+ /* Unprivileged eBPF is enabled */
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_EIBRS:
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ break;
+ case SPECTRE_V2_EIBRS_LFENCE:
+ if (sched_smt_active())
+ pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ break;
+ default:
+ break;
+ }
+}
+#endif
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt);
@@ -621,7 +1000,11 @@ enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_FORCE,
SPECTRE_V2_CMD_RETPOLINE,
SPECTRE_V2_CMD_RETPOLINE_GENERIC,
- SPECTRE_V2_CMD_RETPOLINE_AMD,
+ SPECTRE_V2_CMD_RETPOLINE_LFENCE,
+ SPECTRE_V2_CMD_EIBRS,
+ SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+ SPECTRE_V2_CMD_EIBRS_LFENCE,
+ SPECTRE_V2_CMD_IBRS,
};
enum spectre_v2_user_cmd {
@@ -662,13 +1045,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure)
pr_info("spectre_v2_user=%s forced on command line.\n", reason);
}
+static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
+
static enum spectre_v2_user_cmd __init
-spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
+spectre_v2_parse_user_cmdline(void)
{
char arg[20];
int ret, i;
- switch (v2_cmd) {
+ switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return SPECTRE_V2_USER_CMD_NONE;
case SPECTRE_V2_CMD_FORCE:
@@ -694,8 +1079,20 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
return SPECTRE_V2_USER_CMD_AUTO;
}
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE;
+}
+
+static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
+}
+
static void __init
-spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
+spectre_v2_user_select_mitigation(void)
{
enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
bool smt_possible = IS_ENABLED(CONFIG_SMP);
@@ -708,7 +1105,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
smt_possible = false;
- cmd = spectre_v2_parse_user_cmdline(v2_cmd);
+ cmd = spectre_v2_parse_user_cmdline();
switch (cmd) {
case SPECTRE_V2_USER_CMD_NONE:
goto set_mode;
@@ -756,10 +1153,19 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
}
/*
- * If enhanced IBRS is enabled or SMT impossible, STIBP is not
- * required.
+ * If no STIBP, enhanced IBRS is enabled, or SMT impossible, STIBP
+ * is not required.
+ *
+ * Enhanced IBRS also protects against cross-thread branch target
+ * injection in user-mode as the IBRS bit remains always set which
+ * implicitly enables cross-thread protections. However, in legacy IBRS
+ * mode, the IBRS bit is set only on kernel entry and cleared on return
+ * to userspace. This disables the implicit cross-thread protection,
+ * so allow for STIBP to be selected in that case.
*/
- if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !smt_possible ||
+ spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return;
/*
@@ -771,12 +1177,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
mode = SPECTRE_V2_USER_STRICT_PREFERRED;
- /*
- * If STIBP is not available, clear the STIBP mode.
- */
- if (!boot_cpu_has(X86_FEATURE_STIBP))
- mode = SPECTRE_V2_USER_NONE;
-
spectre_v2_user_stibp = mode;
set_mode:
@@ -785,9 +1185,12 @@ set_mode:
static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
- [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
- [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
- [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
+ [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
+ [SPECTRE_V2_IBRS] = "Mitigation: IBRS",
};
static const struct {
@@ -798,9 +1201,14 @@ static const struct {
{ "off", SPECTRE_V2_CMD_NONE, false },
{ "on", SPECTRE_V2_CMD_FORCE, true },
{ "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
+ { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
{ "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "eibrs", SPECTRE_V2_CMD_EIBRS, false },
+ { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false },
+ { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false },
{ "auto", SPECTRE_V2_CMD_AUTO, false },
+ { "ibrs", SPECTRE_V2_CMD_IBRS, false },
};
static void __init spec_v2_print_cond(const char *reason, bool secure)
@@ -836,16 +1244,48 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
+ cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
!IS_ENABLED(CONFIG_RETPOLINE)) {
- pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option);
+ pr_err("%s selected but not compiled in. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((cmd == SPECTRE_V2_CMD_EIBRS ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n",
+ mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
+ if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("%s selected but not Intel CPU. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
+ pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) {
+ pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n",
+ mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
@@ -854,6 +1294,80 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return cmd;
}
+static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+{
+ if (!IS_ENABLED(CONFIG_RETPOLINE)) {
+ pr_err("Kernel not compiled with retpoline; no mitigation available!");
+ return SPECTRE_V2_NONE;
+ }
+
+ return SPECTRE_V2_RETPOLINE;
+}
+
+/* Disable in-kernel use of non-RSB RET predictors */
+static void __init spec_ctrl_disable_kernel_rrsba(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ return;
+
+ ia32_cap = x86_read_arch_cap_msr();
+
+ if (ia32_cap & ARCH_CAP_RRSBA) {
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
+}
+
+static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
+{
+ /*
+ * Similar to context switches, there are two types of RSB attacks
+ * after VM exit:
+ *
+ * 1) RSB underflow
+ *
+ * 2) Poisoned RSB entry
+ *
+ * When retpoline is enabled, both are mitigated by filling/clearing
+ * the RSB.
+ *
+ * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
+ * prediction isolation protections, RSB still needs to be cleared
+ * because of #2. Note that SMEP provides no protection here, unlike
+ * user-space-poisoned RSB entries.
+ *
+ * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB
+ * bug is present then a LITE version of RSB protection is required,
+ * just a single call needs to retire before a RET is executed.
+ */
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ return;
+
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS:
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) {
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
+ pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ }
+ return;
+
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_IBRS:
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
+ return;
+ }
+
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
+ dump_stack();
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -874,85 +1388,161 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- mode = SPECTRE_V2_IBRS_ENHANCED;
- /* Force it so VMEXIT will restore correctly */
- x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
- goto specv2_set_mode;
+ mode = SPECTRE_V2_EIBRS;
+ break;
}
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_auto;
+
+ if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ retbleed_cmd != RETBLEED_CMD_OFF &&
+ boot_cpu_has(X86_FEATURE_IBRS) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ mode = SPECTRE_V2_IBRS;
+ break;
+ }
+
+ mode = spectre_v2_select_retpoline();
break;
- case SPECTRE_V2_CMD_RETPOLINE_AMD:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_amd;
+
+ case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
+ pr_err(SPECTRE_V2_LFENCE_MSG);
+ mode = SPECTRE_V2_LFENCE;
break;
+
case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_generic;
+ mode = SPECTRE_V2_RETPOLINE;
break;
+
case SPECTRE_V2_CMD_RETPOLINE:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_auto;
+ mode = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_IBRS:
+ mode = SPECTRE_V2_IBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS:
+ mode = SPECTRE_V2_EIBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_LFENCE:
+ mode = SPECTRE_V2_EIBRS_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
+ mode = SPECTRE_V2_EIBRS_RETPOLINE;
break;
}
- pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!");
- return;
-retpoline_auto:
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- retpoline_amd:
- if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
- pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
- goto retpoline_generic;
- }
- mode = SPECTRE_V2_RETPOLINE_AMD;
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
- } else {
- retpoline_generic:
- mode = SPECTRE_V2_RETPOLINE_GENERIC;
+ if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+
+ if (spectre_v2_in_ibrs_mode(mode)) {
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
+
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ case SPECTRE_V2_EIBRS:
+ break;
+
+ case SPECTRE_V2_IBRS:
+ setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS);
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ pr_warn(SPECTRE_V2_IBRS_PERF_MSG);
+ break;
+
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+ /* fallthrough */
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
}
-specv2_set_mode:
+ /*
+ * Disable alternate RSB predictions in kernel when indirect CALLs and
+ * JMPs gets protection against BHI and Intramode-BTI, but RET
+ * prediction from a non-RSB predictor is still a risk.
+ */
+ if (mode == SPECTRE_V2_EIBRS_LFENCE ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_RETPOLINE)
+ spec_ctrl_disable_kernel_rrsba();
+
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
/*
- * If spectre v2 protection has been enabled, unconditionally fill
- * RSB during a context switch; this protects against two independent
- * issues:
+ * If Spectre v2 protection has been enabled, fill the RSB during a
+ * context switch. In general there are two types of RSB attacks
+ * across context switches, for which the CALLs/RETs may be unbalanced.
+ *
+ * 1) RSB underflow
+ *
+ * Some Intel parts have "bottomless RSB". When the RSB is empty,
+ * speculated return targets may come from the branch predictor,
+ * which could have a user-poisoned BTB or BHB entry.
+ *
+ * AMD has it even worse: *all* returns are speculated from the BTB,
+ * regardless of the state of the RSB.
+ *
+ * When IBRS or eIBRS is enabled, the "user -> kernel" attack
+ * scenario is mitigated by the IBRS branch prediction isolation
+ * properties, so the RSB buffer filling wouldn't be necessary to
+ * protect against this type of attack.
+ *
+ * The "user -> user" attack scenario is mitigated by RSB filling.
+ *
+ * 2) Poisoned RSB entry
+ *
+ * If the 'next' in-kernel return stack is shorter than 'prev',
+ * 'next' could be tricked into speculating with a user-poisoned RSB
+ * entry.
+ *
+ * The "user -> kernel" attack scenario is mitigated by SMEP and
+ * eIBRS.
+ *
+ * The "user -> user" scenario, also known as SpectreBHB, requires
+ * RSB clearing.
*
- * - RSB underflow (and switch to BTB) on Skylake+
- * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
+ * So to mitigate all cases, unconditionally fill RSB on context
+ * switches.
+ *
+ * FIXME: Is this pointless for retbleed-affected AMD?
*/
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
+ spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
+
/*
- * Retpoline means the kernel is safe because it has no indirect
- * branches. Enhanced IBRS protects firmware too, so, enable restricted
- * speculation around firmware calls only when Enhanced IBRS isn't
- * supported.
+ * Retpoline protects the kernel, but doesn't protect firmware. IBRS
+ * and Enhanced IBRS protect firmware too, so enable IBRS around
+ * firmware calls only when IBRS / Enhanced IBRS aren't otherwise
+ * enabled.
*
* Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
* the user might select retpoline on the kernel command line and if
* the CPU supports Enhanced IBRS, kernel might un-intentionally not
* enable IBRS around firmware calls.
*/
- if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) {
+ if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
/* Set up IBPB and STIBP depending on the general spectre V2 command */
- spectre_v2_user_select_mitigation(cmd);
+ spectre_v2_cmd = cmd;
}
static void update_stibp_msr(void * __unused)
{
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
+ update_spec_ctrl(val);
}
/* Update x86_spec_ctrl_base in case SMT state changed. */
@@ -987,6 +1577,8 @@ static void update_indir_branch_cond(void)
/* Update the static key controlling the MDS CPU buffer clear in idle */
static void update_mds_branch_idle(void)
{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
/*
* Enable the idle clearing if SMT is active on CPUs which are
* affected only by MSBDS and not any other MDS variant.
@@ -998,19 +1590,26 @@ static void update_mds_branch_idle(void)
if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
return;
- if (sched_smt_active())
+ if (sched_smt_active()) {
static_branch_enable(&mds_idle_clear);
- else
+ } else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
+ (ia32_cap & ARCH_CAP_FBSDP_NO)) {
static_branch_disable(&mds_idle_clear);
+ }
}
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
void arch_smt_update(void)
{
mutex_lock(&spec_ctrl_mutex);
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
break;
@@ -1046,6 +1645,16 @@ void arch_smt_update(void)
break;
}
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -1150,16 +1759,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
}
/*
- * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper
- * bit in the mask to allow guests to use the mitigation even in the
- * case where the host does not enable it.
- */
- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
- static_cpu_has(X86_FEATURE_AMD_SSBD)) {
- x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
- }
-
- /*
* We have three CPU feature flags that are in play here:
* - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
* - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
@@ -1176,7 +1775,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
x86_amd_ssb_disable();
} else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ update_spec_ctrl(x86_spec_ctrl_base);
}
}
@@ -1255,7 +1854,6 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return 0;
-
/*
* With strict mode for both IBPB and STIBP, the instruction
* code paths avoid checking this task flag and instead,
@@ -1295,6 +1893,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
task_update_spec_tif(task);
+ if (task == current)
+ indirect_branch_prediction_barrier();
break;
default:
return -ERANGE;
@@ -1382,7 +1982,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
void x86_spec_ctrl_setup_ap(void)
{
if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ update_spec_ctrl(x86_spec_ctrl_base);
if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
x86_amd_ssb_disable();
@@ -1598,9 +2198,26 @@ static ssize_t tsx_async_abort_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t mmio_stale_data_show_state(char *buf)
+{
+ if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
+ return sysfs_emit(buf, "Unknown: No mitigations\n");
+
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mmio_strings[mmio_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
static char *stibp_state(void)
{
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return "";
switch (spectre_v2_user_stibp) {
@@ -1630,11 +2247,56 @@ static char *ibpb_state(void)
return "";
}
+static char *pbrsb_eibrs_state(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
+ boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
+ return ", PBRSB-eIBRS: SW sequence";
+ else
+ return ", PBRSB-eIBRS: Vulnerable";
+ } else {
+ return ", PBRSB-eIBRS: Not affected";
+ }
+}
+
+static ssize_t spectre_v2_show_state(char *buf)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+ return sprintf(buf, "Vulnerable: LFENCE\n");
+
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+ return sprintf(buf, "%s%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ pbrsb_eibrs_state(),
+ spectre_v2_module_string());
+}
+
static ssize_t srbds_show_state(char *buf)
{
return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
}
+static ssize_t retbleed_show_state(char *buf)
+{
+ return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+static ssize_t gds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -1655,12 +2317,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
case X86_BUG_SPECTRE_V2:
- return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
- ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
- spectre_v2_module_string());
+ return spectre_v2_show_state(buf);
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
@@ -1682,6 +2339,16 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_SRBDS:
return srbds_show_state(buf);
+ case X86_BUG_MMIO_STALE_DATA:
+ case X86_BUG_MMIO_UNKNOWN:
+ return mmio_stale_data_show_state(buf);
+
+ case X86_BUG_RETBLEED:
+ return retbleed_show_state(buf);
+
+ case X86_BUG_GDS:
+ return gds_show_state(buf);
+
default:
break;
}
@@ -1733,4 +2400,22 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *
{
return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
}
+
+ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN);
+ else
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+}
+
+ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
+}
+
+ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
+}
#endif
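The update_mds_branch_idle() hunk above keys the idle-time CPU buffer clear off both the SMT state and the new MMIO Stale Data inputs. A minimal userspace sketch of that three-way decision follows; the enum values and the FBSDP_NO bit position are illustrative assumptions, not the kernel's definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ARCH_CAP_FBSDP_NO (1ULL << 14)	/* assumed bit position, for illustration only */

enum mmio_mitigation { MMIO_MITIGATION_OFF, MMIO_MITIGATION_UCODE_NEEDED, MMIO_MITIGATION_VERW };

/*
 * Returns 1 to enable the idle buffer clear, 0 to disable it, and -1 to
 * leave it unchanged, mirroring the control flow of the hunk above.
 */
static int mds_idle_clear_action(bool msbds_only, bool smt_active,
				 enum mmio_mitigation mmio, uint64_t ia32_cap)
{
	if (!msbds_only)
		return -1;	/* not an MSBDS-only CPU: nothing to adjust here */
	if (smt_active)
		return 1;	/* SMT on: clear CPU buffers when going idle */
	if (mmio == MMIO_MITIGATION_OFF || (ia32_cap & ARCH_CAP_FBSDP_NO))
		return 0;	/* SMT off and MMIO mitigation does not need it: disable */
	return -1;		/* keep the clear for the MMIO Stale Data mitigation */
}

int main(void)
{
	printf("%d\n", mds_idle_clear_action(true, false, MMIO_MITIGATION_VERW, 0));
	return 0;
}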
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7e7400af7797..d315e928b95c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,14 +13,20 @@
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/kgdb.h>
+#include <linux/mem_encrypt.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/syscore_ops.h>
#include <asm/stackprotector.h>
+#include <linux/utsname.h>
+
+#include <asm/alternative.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/archrandom.h>
@@ -56,6 +62,7 @@
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
#endif
+#include <asm/set_memory.h>
#include "cpu.h"
@@ -954,6 +961,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define MSBDS_ONLY BIT(5)
#define NO_SWAPGS BIT(6)
#define NO_ITLB_MULTIHIT BIT(7)
+#define NO_MMIO BIT(8)
+#define NO_EIBRS_PBRSB BIT(9)
#define VULNWL(_vendor, _family, _model, _whitelist) \
{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -971,6 +980,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
+ VULNWL_INTEL(TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(ALDERLAKE_L, NO_MMIO),
+
VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
@@ -988,9 +1002,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1000,37 +1014,78 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
- VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
{}
};
+#define VULNBL(vendor, family, model, blacklist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
+
#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
INTEL_FAM6_##model, steppings, \
X86_FEATURE_ANY, issues)
+#define VULNBL_AMD(family, blacklist) \
+ VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
+
#define SRBDS BIT(0)
+/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
+#define MMIO BIT(1)
+/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
+#define MMIO_SBDS BIT(2)
+/* CPU is affected by RETbleed, speculating where you would not expect it */
+#define RETBLEED BIT(3)
+/* CPU is affected by SMT (cross-thread) return predictions */
+#define SMT_RSB BIT(4)
+/* CPU is affected by SRSO */
+#define SRSO BIT(5)
+/* CPU is affected by GDS */
+#define GDS BIT(6)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_CORE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_ULT, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_GT3E, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_XEON_D,X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL_GT3E, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL_CORE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPINGS(0x0, 0xC), SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPINGS(0x0, 0xD), SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(CANNONLAKE_MOBILE,X86_STEPPING_ANY, RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_MOBILE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_XEON_D, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
+
+ VULNBL_AMD(0x15, RETBLEED),
+ VULNBL_AMD(0x16, RETBLEED),
+ VULNBL_AMD(0x17, RETBLEED),
{}
};
@@ -1051,6 +1106,13 @@ u64 x86_read_arch_cap_msr(void)
return ia32_cap;
}
+static bool arch_cap_mmio_immune(u64 ia32_cap)
+{
+ return (ia32_cap & ARCH_CAP_FBSDP_NO &&
+ ia32_cap & ARCH_CAP_PSDP_NO &&
+ ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
+}
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -1102,12 +1164,53 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
/*
* SRBDS affects CPUs which support RDRAND or RDSEED and are listed
* in the vulnerability blacklist.
+ *
+ * Some of the implications and mitigation of Shared Buffers Data
+	 * Sampling (SBDS) are similar to SRBDS. Give SBDS the same treatment
+	 * as SRBDS.
*/
if ((cpu_has(c, X86_FEATURE_RDRAND) ||
cpu_has(c, X86_FEATURE_RDSEED)) &&
- cpu_matches(cpu_vuln_blacklist, SRBDS))
+ cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS))
setup_force_cpu_bug(X86_BUG_SRBDS);
+ /*
+ * Processor MMIO Stale Data bug enumeration
+ *
+	 * The affected CPU list is generally enough to enumerate the vulnerability,
+	 * but in the virtualization case also check the ARCH_CAP MSR bits, as the
+	 * VMM may not want the guest to enumerate the bug.
+	 *
+	 * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist
+	 * nor in the whitelist and also don't enumerate the MSR ARCH_CAP MMIO bits.
+ */
+ if (!arch_cap_mmio_immune(ia32_cap)) {
+ if (cpu_matches(cpu_vuln_blacklist, MMIO))
+ setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+ else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO))
+ setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN);
+ }
+
+ if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA))
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ }
+
+ if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) &&
+ !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
+ !(ia32_cap & ARCH_CAP_PBRSB_NO))
+ setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+
+ /*
+ * Check if CPU is vulnerable to GDS. If running in a virtual machine on
+ * an affected processor, the VMM may have disabled the use of GATHER by
+ * disabling AVX2. The only way to do this in HW is to clear XCR0[2],
+ * which means that AVX will be disabled.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ setup_force_cpu_bug(X86_BUG_GDS);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1193,8 +1296,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_set_bug_bits(c);
- fpu__init_system(c);
-
#ifdef CONFIG_X86_32
/*
* Regardless of whether PCID is enumerated, the SDM says
@@ -1583,6 +1684,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
+ if (boot_cpu_has_bug(X86_BUG_GDS))
+ update_gds_msr();
}
static __init int setup_noclflush(char *arg)
@@ -1900,8 +2003,6 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- fpu__init_cpu();
-
if (is_uv_system())
uv_cpu_init();
@@ -1965,8 +2066,6 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- fpu__init_cpu();
-
load_fixmap_gdt(cpu);
}
#endif
@@ -1999,6 +2098,8 @@ void microcode_check(void)
perf_check_microcode();
+ amd_check_microcode();
+
/* Reload CPUID max function as it might've changed. */
info.cpuid_level = cpuid_eax(0);
@@ -2017,3 +2118,69 @@ void microcode_check(void)
pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
}
+
+void __init arch_cpu_finalize_init(void)
+{
+ identify_boot_cpu();
+
+ /*
+ * identify_boot_cpu() initialized SMT support information, let the
+ * core code know.
+ */
+ cpu_smt_check_topology();
+
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
+
+ cpu_select_mitigations();
+
+ arch_smt_update();
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /*
+		 * Check whether this is a real i386, which is no longer
+		 * supported, and fix up the utsname.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ }
+
+ /*
+ * Must be before alternatives because it might set or clear
+ * feature bits.
+ */
+ fpu__init_system();
+ fpu__init_cpu();
+
+ alternative_instructions();
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+		 * Make sure the first 2MB area is not mapped by huge pages.
+		 * There are typically fixed-size MTRRs in there and overlapping
+		 * MTRRs into large pages causes slowdowns.
+		 *
+		 * Right now we don't do that with gbpages because there seems
+		 * to be very little benefit in that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+ } else {
+ fpu__init_check_bugs();
+ }
+
+ /*
+ * This needs to be called before any devices perform DMA
+ * operations that might use the SWIOTLB bounce buffers. It will
+ * mark the bounce buffers as decrypted so that their usage will
+ * not cause "plain-text" data to be decrypted when accessed. It
+ * must be called after late_time_init() so that Hyper-V x86/x64
+ * hypercalls work when the SWIOTLB bounce buffers are decrypted.
+ */
+ mem_encrypt_init();
+}
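The new arch_cap_mmio_immune() helper in common.c treats a CPU as immune to MMIO Stale Data only when all three relevant IA32_ARCH_CAPABILITIES "NO" bits are reported. A stand-alone sketch of that check; the bit positions below are stated from memory and should be treated as illustrative rather than authoritative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ARCH_CAP_SBDR_SSDP_NO (1ULL << 13)	/* assumed bit positions */
#define ARCH_CAP_FBSDP_NO     (1ULL << 14)
#define ARCH_CAP_PSDP_NO      (1ULL << 15)

/* Mirror of arch_cap_mmio_immune(): immune only if the fill buffer, primary
 * stale data and shared/sideband buffer "NO" bits are all set by the CPU
 * (or by the VMM in a guest). */
static bool mmio_immune(uint64_t ia32_cap)
{
	return (ia32_cap & ARCH_CAP_FBSDP_NO) &&
	       (ia32_cap & ARCH_CAP_PSDP_NO) &&
	       (ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
}

int main(void)
{
	uint64_t cap = ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO;

	printf("immune: %d\n", mmio_immune(cap));	/* 0: SBDR_SSDP_NO missing */
	return 0;
}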
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4eb9bf68b122..ca1b8bf380a2 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -79,9 +79,11 @@ extern void detect_ht(struct cpuinfo_x86 *c);
extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
+void cpu_select_mitigations(void);
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
+extern void update_gds_msr(void);
extern u64 x86_read_arch_cap_msr(void);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a5287b18a63f..76692590eff8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -71,7 +71,7 @@ static bool ring3mwait_disabled __read_mostly;
static int __init ring3mwait_disable(char *__unused)
{
ring3mwait_disabled = true;
- return 0;
+ return 1;
}
__setup("ring3mwait=disable", ring3mwait_disable);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index f406e3b85bdb..1125f752f126 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -585,8 +585,10 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
/*
* Ensure the task's closid and rmid are written before determining whether
* the task is current; that decides whether it will be interrupted.
+ * This pairs with the full barrier between the rq->curr update and
+ * resctrl_sched_in() during context switch.
*/
- barrier();
+ smp_mb();
/*
* By now, the task's closid and rmid are set. If the task is current
@@ -2140,19 +2142,23 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
t->closid = to->closid;
t->rmid = to->mon.rmid;
-#ifdef CONFIG_SMP
/*
- * This is safe on x86 w/o barriers as the ordering
- * of writing to task_cpu() and t->on_cpu is
- * reverse to the reading here. The detection is
- * inaccurate as tasks might move or schedule
- * before the smp function call takes place. In
- * such a case the function call is pointless, but
+ * Order the closid/rmid stores above before the loads
+ * in task_curr(). This pairs with the full barrier
+ * between the rq->curr update and resctrl_sched_in()
+ * during context switch.
+ */
+ smp_mb();
+
+ /*
+ * If the task is on a CPU, set the CPU in the mask.
+ * The detection is inaccurate as tasks might move or
+ * schedule before the smp function call takes place.
+ * In such a case the function call is pointless, but
* there is no other side effect.
*/
- if (mask && t->on_cpu)
+ if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
cpumask_set_cpu(task_cpu(t), mask);
-#endif
}
}
read_unlock(&tasklist_lock);
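The rdtgroup hunks replace a compiler-only barrier() with smp_mb() so the closid/rmid stores are ordered before the task_curr() load, pairing with a full barrier on the context-switch side. This is the classic store-buffering pattern; a userspace analogue with C11 fences (not the resctrl code itself, and with made-up variable names) is sketched below.

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

/* Each side stores, issues a full fence, then loads what the other side
 * stored.  With only a compiler barrier the "0/0" outcome is possible on
 * x86 because of store buffering; with full fences it is not. */
static atomic_int closid_updated, task_running;
static int r_writer, r_switcher;

static void *writer(void *arg)
{
	(void)arg;
	atomic_store_explicit(&closid_updated, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() analogue */
	r_writer = atomic_load_explicit(&task_running, memory_order_relaxed);
	return NULL;
}

static void *switcher(void *arg)
{
	(void)arg;
	atomic_store_explicit(&task_running, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* barrier on the rq->curr side */
	r_switcher = atomic_load_explicit(&closid_updated, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, writer, NULL);
	pthread_create(&b, NULL, switcher, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	/* With both fences in place, at least one thread sees the other's store. */
	printf("writer saw %d, switcher saw %d\n", r_writer, r_switcher);
	return 0;
}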
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 751e59057466..ad6776081e60 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -16,12 +16,17 @@
* respective wildcard entries.
*
* A typical table entry would be to match a specific CPU
- * { X86_VENDOR_INTEL, 6, 0x12 }
- * or to match a specific CPU feature
- * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
+ * X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
- * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
+ *
+ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts
+ * for various common selections. The above can be shortened to:
+ *
+ * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
@@ -53,3 +58,34 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
return NULL;
}
EXPORT_SYMBOL(x86_match_cpu);
+
+static const struct x86_cpu_desc *
+x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ const struct x86_cpu_desc *m;
+
+ for (m = match; m->x86_family | m->x86_model; m++) {
+ if (c->x86_vendor != m->x86_vendor)
+ continue;
+ if (c->x86 != m->x86_family)
+ continue;
+ if (c->x86_model != m->x86_model)
+ continue;
+ if (c->x86_stepping != m->x86_stepping)
+ continue;
+ return m;
+ }
+ return NULL;
+}
+
+bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
+{
+ const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);
+
+ if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
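The match.c addition walks a table keyed by vendor/family/model/stepping and then compares a minimum microcode revision. A self-contained model of that lookup is below; the struct layouts are simplified stand-ins that only mimic struct x86_cpu_desc, and the table entry is hypothetical.

#include <stdbool.h>
#include <stdio.h>

struct cpu_desc {
	unsigned char vendor, family, model, stepping;
	unsigned int min_microcode_rev;
};

struct cpu_info {
	unsigned char vendor, family, model, stepping;
	unsigned int microcode;
};

static const struct cpu_desc *match_with_stepping(const struct cpu_desc *table,
						  const struct cpu_info *c)
{
	const struct cpu_desc *m;

	/* Table is terminated by an all-zero family/model entry, like the kernel's. */
	for (m = table; m->family | m->model; m++) {
		if (c->vendor == m->vendor && c->family == m->family &&
		    c->model == m->model && c->stepping == m->stepping)
			return m;
	}
	return NULL;
}

static bool has_min_microcode_rev(const struct cpu_desc *table,
				  const struct cpu_info *c)
{
	const struct cpu_desc *res = match_with_stepping(table, c);

	/* Not listed, or listed but running older microcode: false. */
	return res && res->min_microcode_rev <= c->microcode;
}

int main(void)
{
	static const struct cpu_desc table[] = {
		{ 0, 6, 0x55, 4, 0x2000065 },	/* hypothetical entry */
		{ 0 }
	};
	struct cpu_info c = { 0, 6, 0x55, 4, 0x2000069 };

	printf("%d\n", has_min_microcode_rev(table, &c));	/* 1 */
	return 0;
}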
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 9cc524be3c94..14dc3c1f7fb4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -354,7 +354,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf,
char buf[MAX_FLAG_OPT_SIZE], *__buf;
int err;
- if (cnt > MAX_FLAG_OPT_SIZE)
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2a13468f8773..8f36ccf26cec 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -295,11 +295,17 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static void mce_panic(const char *msg, struct mce *final, char *exp)
+static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
- int apei_err = 0;
struct llist_node *pending;
struct mce_evt_llist *l;
+ int apei_err = 0;
+
+ /*
+	 * Allow instrumentation around the use of external facilities. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
if (!fake_panic) {
/*
@@ -314,7 +320,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
} else {
/* Don't log too much for fake panic */
if (atomic_inc_return(&mce_fake_panicked) > 1)
- return;
+ goto out;
}
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
@@ -352,6 +358,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
}
/* Support code for software error injection */
@@ -642,7 +651,7 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));
@@ -1021,10 +1030,13 @@ static int mce_start(int *no_way_out)
* Synchronize between CPUs after main scanning loop.
* This invokes the bulk of the Monarch processing.
*/
-static int mce_end(int order)
+static noinstr int mce_end(int order)
{
- int ret = -1;
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
if (!timeout)
goto reset;
@@ -1068,7 +1080,8 @@ static int mce_end(int order)
/*
* Don't reset anything. That's done by the Monarch.
*/
- return 0;
+ ret = 0;
+ goto out;
}
/*
@@ -1083,6 +1096,10 @@ reset:
* Let others run again.
*/
atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
return ret;
}
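The mce.c hunks mark mce_panic() and mce_end() noinstr and then explicitly re-allow instrumentation around their bodies, which forces every early return onto a common exit path. A tiny model of that bracketing pattern, with the begin/end calls stubbed out:

#include <stdio.h>

/* Stubs standing in for instrumentation_begin()/instrumentation_end();
 * only the control flow matters here, not the real annotations. */
static void instrumentation_begin(void) { puts("begin"); }
static void instrumentation_end(void)   { puts("end");   }

/* Once a begin/end pair brackets the body, every early "return" has to
 * become "goto out" so the matching end is never skipped. */
static int do_work(int fake, int already_panicked)
{
	int ret = -1;

	instrumentation_begin();

	if (fake && already_panicked)
		goto out;		/* was: return; */

	/* ... the actual work would go here ... */
	ret = 0;

out:
	instrumentation_end();
	return ret;
}

int main(void)
{
	printf("ret=%d\n", do_work(1, 1));
	return 0;
}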
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index a4e7e100ed26..5698e04803b5 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -54,7 +54,9 @@ struct cont_desc {
};
static u32 ucode_new_rev;
-static u8 amd_ucode_patch[PATCH_MAX_SIZE];
+
+/* One blob per node. */
+static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE];
/*
* Microcode patch container file is prepended to the initrd in cpio
@@ -210,7 +212,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p
patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
- patch = &amd_ucode_patch;
+ patch = &amd_ucode_patch[0];
#endif
desc.cpuid_1_eax = cpuid_1_eax;
@@ -222,7 +224,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p
return ret;
native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (rev >= mc->hdr.patch_id)
+
+ /*
+ * Allow application of the same revision to pick up SMT-specific
+ * changes even if the revision of the other SMT thread is already
+ * up-to-date.
+ */
+ if (rev > mc->hdr.patch_id)
return ret;
if (!__apply_microcode_amd(mc)) {
@@ -304,8 +312,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- /* Check whether we have saved a new patch already: */
- if (*new_rev && rev < mc->hdr.patch_id) {
+ /*
+ * Check whether a new patch has been saved already. Also, allow application of
+ * the same revision in order to pick up SMT-thread-specific configuration even
+ * if the sibling SMT thread already has an up-to-date revision.
+ */
+ if (*new_rev && rev <= mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
*new_rev = mc->hdr.patch_id;
return;
@@ -319,8 +331,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false);
}
-static enum ucode_state
-load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
@@ -338,19 +349,19 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
if (!desc.mc)
return -EINVAL;
- ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
+ ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
if (ret > UCODE_UPDATED)
return -EINVAL;
return 0;
}
-void reload_ucode_amd(void)
+void reload_ucode_amd(unsigned int cpu)
{
- struct microcode_amd *mc;
u32 rev, dummy;
+ struct microcode_amd *mc;
- mc = (struct microcode_amd *)amd_ucode_patch;
+ mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)];
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
@@ -521,7 +532,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
/* need to apply patch? */
- if (rev >= mc_amd->hdr.patch_id) {
+ if (rev > mc_amd->hdr.patch_id) {
ret = UCODE_OK;
goto out;
}
@@ -688,9 +699,10 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-static enum ucode_state
-load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
struct ucode_patch *p;
enum ucode_state ret;
@@ -703,22 +715,22 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
return ret;
}
- p = find_patch(0);
- if (!p) {
- return ret;
- } else {
- if (boot_cpu_data.microcode >= p->patch_id)
- return ret;
+ for_each_node(nid) {
+ cpu = cpumask_first(cpumask_of_node(nid));
+ c = &cpu_data(cpu);
- ret = UCODE_NEW;
- }
+ p = find_patch(cpu);
+ if (!p)
+ continue;
- /* save BSP's matching patch for early load */
- if (!save)
- return ret;
+ if (c->microcode >= p->patch_id)
+ continue;
+
+ ret = UCODE_NEW;
- memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
- memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+ memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE);
+ memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+ }
return ret;
}
@@ -744,12 +756,11 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
- bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
- if (!refresh_fw || !bsp)
+ if (!refresh_fw)
return UCODE_OK;
if (c->x86 >= 0x15)
@@ -766,7 +777,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
goto fw_release;
}
- ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
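Several of the AMD microcode hunks relax the skip condition from 'rev >= patch_id' to 'rev > patch_id', so a patch carrying the same revision can still be applied (the stated reason being SMT-sibling state). The gate itself, reduced to a pure function with made-up revision numbers:

#include <stdbool.h>
#include <stdio.h>

/* Apply when the patch revision is equal to or newer than what is running;
 * skip only when the running revision is strictly newer. */
static bool should_apply(unsigned int current_rev, unsigned int patch_id)
{
	return !(current_rev > patch_id);
}

int main(void)
{
	printf("%d\n", should_apply(0x08301055, 0x08301055));	/* 1: same rev, reapply */
	printf("%d\n", should_apply(0x08301072, 0x08301055));	/* 0: already newer */
	return 0;
}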
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index eab4de387ce6..963b989710f9 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -326,7 +326,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
#endif
}
-void reload_early_microcode(void)
+void reload_early_microcode(unsigned int cpu)
{
int vendor, family;
@@ -340,7 +340,7 @@ void reload_early_microcode(void)
break;
case X86_VENDOR_AMD:
if (family >= 0x10)
- reload_ucode_amd();
+ reload_ucode_amd(cpu);
break;
default:
break;
@@ -773,9 +773,9 @@ static struct subsys_interface mc_cpu_interface = {
};
/**
- * mc_bp_resume - Update boot CPU microcode during resume.
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
*/
-static void mc_bp_resume(void)
+void microcode_bsp_resume(void)
{
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -783,11 +783,11 @@ static void mc_bp_resume(void)
if (uci->valid && uci->mc)
microcode_ops->apply_microcode(cpu);
else if (!uci->mc)
- reload_early_microcode();
+ reload_early_microcode(cpu);
}
static struct syscore_ops mc_syscore_ops = {
- .resume = mc_bp_resume,
+ .resume = microcode_bsp_resume,
};
static int mc_cpu_starting(unsigned int cpu)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 3aa0e5a45303..31ad79a0c6f3 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -662,7 +662,6 @@ void load_ucode_intel_ap(void)
else
iup = &intel_ucode_patch;
-reget:
if (!*iup) {
patch = __load_ucode_intel(&uci);
if (!patch)
@@ -673,12 +672,7 @@ reget:
uci.mc = *iup;
- if (apply_microcode_early(&uci, true)) {
- /* Mixed-silicon system? Try to refetch the proper patch: */
- *iup = NULL;
-
- goto reget;
- }
+ apply_microcode_early(&uci, true);
}
static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 5a52672e3f8b..90bd155d7e7a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -21,6 +21,7 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
{ X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
{ X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
{ X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 71ca064e3794..31fe56a90cbf 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -44,7 +44,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c)
* initial apic id, which also represents 32-bit extended x2apic id.
*/
c->initial_apicid = edx;
- smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
#endif
return 0;
}
@@ -68,7 +68,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
* Populate HT related information from sub-leaf level 0.
*/
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
- core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
sub_index = 1;
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 032509adf9de..88a553ee7704 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -55,24 +55,6 @@ void tsx_enable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-static bool __init tsx_ctrl_is_supported(void)
-{
- u64 ia32_cap = x86_read_arch_cap_msr();
-
- /*
- * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
- * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
- *
- * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
- * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
- * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
- * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
- * tsx= cmdline requests will do nothing on CPUs without
- * MSR_IA32_TSX_CTRL support.
- */
- return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
-}
-
static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
{
if (boot_cpu_has_bug(X86_BUG_TAA))
@@ -86,9 +68,22 @@ void __init tsx_init(void)
char arg[5] = {};
int ret;
- if (!tsx_ctrl_is_supported())
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ if (!(x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR))
return;
+ setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL);
+
ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
if (ret >= 0) {
if (!strcmp(arg, "on")) {
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 91b3483e5085..e40bb50c5c4f 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -35,7 +35,6 @@
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
#include <asm/intel_pt.h>
/* Used while preparing memory map entries for second kernel */
@@ -86,15 +85,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-
/*
* Disable Intel PT to stop its logging
*/
@@ -153,12 +143,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2b5886401e5f..7e698c45760c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -171,7 +171,6 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
- stack = stack ? : get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);
/*
@@ -190,9 +189,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - hardirq stack
* - entry stack
*/
- for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ for (stack = stack ?: get_stack_pointer(task, regs);
+ stack;
+ stack = stack_info.next_sp) {
const char *stack_name;
+ stack = PTR_ALIGN(stack, sizeof(long));
+
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
/*
* We weren't on a valid stack. It's possible that
@@ -326,7 +329,7 @@ unsigned long oops_begin(void)
}
NOKPROBE_SYMBOL(oops_begin);
-void __noreturn rewind_stack_do_exit(int signr);
+void __noreturn rewind_stack_and_make_dead(int signr);
void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
@@ -361,7 +364,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
* reuse the task stack and that existing poisons are invalid.
*/
kasan_unpoison_task_stack(current);
- rewind_stack_do_exit(signr);
+ rewind_stack_and_make_dead(signr);
}
NOKPROBE_SYMBOL(oops_end);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 50d5848bf22e..d5352b0ae091 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -515,6 +515,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = {
.stolen_size = gen9_stolen_size,
};
+/* Intel integrated GPUs for which we need to reserve "stolen memory" */
static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_I830_IDS(&i830_early_ops),
INTEL_I845G_IDS(&i845_early_ops),
@@ -584,6 +585,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func)
u16 device;
int i;
+ /*
+ * Reserve "stolen memory" for an integrated GPU. If we've already
+ * found one, there's nothing to do for other (discrete) GPUs.
+ */
+ if (resource_size(&intel_graphics_stolen_res))
+ return;
+
device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
@@ -696,7 +704,7 @@ static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
- QFLAG_APPLY_ONCE, intel_graphics_quirks },
+ 0, intel_graphics_quirks },
/*
* HPET on the current version of the Baytrail platform has accuracy
* problems: it will halt in deep idle state - so we disable it.
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 9692ccc583bb..644372a10c89 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -49,7 +49,7 @@ void fpu__init_cpu(void)
fpu__init_cpu_xstate();
}
-static bool fpu__probe_without_cpuid(void)
+static bool __init fpu__probe_without_cpuid(void)
{
unsigned long cr0;
u16 fsw, fcw;
@@ -67,7 +67,7 @@ static bool fpu__probe_without_cpuid(void)
return fsw == 0 && (fcw & 0x103f) == 0x003f;
}
-static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+static void __init fpu__init_system_early_generic(void)
{
if (!boot_cpu_has(X86_FEATURE_CPUID) &&
!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
@@ -297,10 +297,10 @@ static void __init fpu__init_parse_early_param(void)
* Called on the boot CPU once per system bootup, to set up the initial
* FPU state that is later cloned into all processes:
*/
-void __init fpu__init_system(struct cpuinfo_x86 *c)
+void __init fpu__init_system(void)
{
fpu__init_parse_early_param();
- fpu__init_system_early_generic(c);
+ fpu__init_system_early_generic();
/*
* The FPU has to be operational for some of the
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 7d372db8bee1..e33b732ad337 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -811,6 +811,14 @@ void __init fpu__init_system_xstate(void)
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
setup_xstate_comp();
+
+ /*
+ * CPU capabilities initialization runs before FPU init. So
+ * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
+ * functional, set the feature bit so depending code works.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
+
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 88dc38b4a147..90c2613af36b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -383,6 +383,8 @@ static void __init clear_bss(void)
{
memset(__bss_start, 0,
(unsigned long) __bss_stop - (unsigned long) __bss_start);
+ memset(__brk_base, 0,
+ (unsigned long) __brk_limit - (unsigned long) __brk_base);
}
static unsigned long get_cmd_line_ptr(void)
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index fe522691ac71..82753622f489 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -32,6 +32,7 @@
*/
static void init_8259A(int auto_eoi);
+static bool pcat_compat __ro_after_init;
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);
@@ -114,6 +115,7 @@ static void make_8259A_irq(unsigned int irq)
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ irq_set_status_flags(irq, IRQ_LEVEL);
enable_irq(irq);
lapic_assign_legacy_vector(irq, true);
}
@@ -300,15 +302,32 @@ static void unmask_8259A(void)
static int probe_8259A(void)
{
+ unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
unsigned long flags;
- unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
- unsigned char new_val;
+
+ /*
+ * If MADT has the PCAT_COMPAT flag set, then do not bother probing
+ * for the PIC. Some BIOSes leave the PIC uninitialized and probing
+ * fails.
+ *
+	 * Right now this causes problems as quite a bit of code depends on
+	 * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
+	 * when the system has an IO/APIC, because then the PIC is not required
+ * at all, except for really old machines where the timer interrupt
+ * must be routed through the PIC. So just pretend that the PIC is
+ * there and let legacy_pic->init() initialize it for nothing.
+ *
+ * Alternatively this could just try to initialize the PIC and
+ * repeat the probe, but for cases where there is no PIC that's
+ * just pointless.
+ */
+ if (pcat_compat)
+ return nr_legacy_irqs();
+
/*
- * Check to see if we have a PIC.
- * Mask all except the cascade and read
- * back the value we just wrote. If we don't
- * have a PIC, we will read 0xff as opposed to the
- * value we wrote.
+ * Check to see if we have a PIC. Mask all except the cascade and
+ * read back the value we just wrote. If we don't have a PIC, we
+ * will read 0xff as opposed to the value we wrote.
*/
raw_spin_lock_irqsave(&i8259A_lock, flags);
@@ -430,5 +449,9 @@ static int __init i8259A_init_ops(void)
return 0;
}
-
device_initcall(i8259A_init_ops);
+
+void __init legacy_pic_pcat_compat(void)
+{
+ pcat_compat = true;
+}
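probe_8259A() now short-circuits when the MADT advertised PCAT_COMPAT and otherwise keeps the classic write-mask-and-read-back probe. The decision logic, modelled as a pure function with the port I/O factored out (the 0xff read-back meaning "no PIC" comes from the comment above):

#include <stdbool.h>
#include <stdio.h>

#define PIC_CASCADE_IR 2

/* If firmware said PCAT_COMPAT, trust it; otherwise compare the value read
 * back from the mask register with the probe value that was written.  A
 * missing PIC reads back as 0xff. */
static bool pic_present(bool pcat_compat, unsigned char readback)
{
	unsigned char probe_val = (unsigned char)~(1 << PIC_CASCADE_IR);

	if (pcat_compat)
		return true;
	return readback == probe_val;
}

int main(void)
{
	printf("%d\n", pic_present(false, 0xff));	/* 0: no PIC answered */
	printf("%d\n", pic_present(false, 0xfb));	/* 1: read back what we wrote */
	return 0;
}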
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a0693b71cfc1..f2c215e1f64c 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -72,8 +72,10 @@ void __init init_ISA_irqs(void)
legacy_pic->init(0);
- for (i = 0; i < nr_legacy_irqs(); i++)
+ for (i = 0; i < nr_legacy_irqs(); i++) {
irq_set_chip_and_handler(i, chip, handle_level_irq);
+ irq_set_status_flags(i, IRQ_LEVEL);
+ }
}
void __init init_IRQ(void)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 544bd41a514c..36b5a493e5b4 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -56,8 +56,8 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
/* This function only handles jump-optimized kprobe */
if (kp && kprobe_optimized(kp)) {
op = container_of(kp, struct optimized_kprobe, kp);
- /* If op->list is not empty, op is under optimizing */
- if (list_empty(&op->list))
+ /* If op is optimized or under unoptimizing */
+ if (list_empty(&op->list) || optprobe_queued_unopt(op))
goto found;
}
}
@@ -328,7 +328,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op)
for (i = 1; i < op->optinsn.size; i++) {
p = get_kprobe(op->kp.addr + i);
- if (p && !kprobe_disabled(p))
+ if (p && !kprobe_disarmed(p))
return -EEXIST;
}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index cee45d46e67d..bd6ee0bfa1d2 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -480,7 +480,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
ipi_bitmap <<= min - apic_id;
min = apic_id;
- } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+ } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
max = apic_id < max ? max : apic_id;
} else {
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
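The kvm.c fix adds an 'apic_id > min' test so that a destination below the current window can no longer fall into the extend-upwards branch and produce a negative bit index. A userspace model of the windowing with a single 64-bit word (the kernel tracks a wider cluster across two words, so the size here is only illustrative):

#include <stdint.h>
#include <stdio.h>

#define CLUSTER_SIZE 64		/* one 64-bit word here; the real window is wider */

struct ipi_window {
	uint64_t bitmap;
	int min, max;
	int flushes;
};

static void window_flush(struct ipi_window *w)
{
	if (w->bitmap)
		w->flushes++;	/* stands in for the hypercall in the real code */
	w->bitmap = 0;
}

/* The bitmap can only describe IDs in [min, min + CLUSTER_SIZE).  IDs below
 * the window shift the bitmap down when they still fit; anything else forces
 * a flush and a fresh window.  The 'apic_id > min' test keeps a too-small ID
 * out of the last branch, where it would compute a negative bit index. */
static void window_add(struct ipi_window *w, int apic_id)
{
	if (!w->bitmap) {
		w->min = w->max = apic_id;
	} else if (apic_id < w->min && w->max - apic_id < CLUSTER_SIZE) {
		w->bitmap <<= w->min - apic_id;
		w->min = apic_id;
	} else if (apic_id > w->min && apic_id < w->min + CLUSTER_SIZE) {
		w->max = apic_id < w->max ? w->max : apic_id;
	} else {
		window_flush(w);
		w->min = w->max = apic_id;
	}
	w->bitmap |= 1ULL << (apic_id - w->min);
}

int main(void)
{
	struct ipi_window w = { 0 };
	int ids[] = { 70, 5, 130 };	/* 5 no longer aliases into 70's window */

	for (unsigned int i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		window_add(&w, ids[i]);
	window_flush(&w);
	printf("flushes: %d\n", w.flushes);
	return 0;
}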
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 6b07faaa1579..23154d24b117 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -27,6 +27,11 @@ static __init int register_e820_pmem(void)
* simply here to trigger the module to load on demand.
*/
pdev = platform_device_alloc("e820_pmem", -1);
- return platform_device_add(pdev);
+
+ rc = platform_device_add(pdev);
+ if (rc)
+ platform_device_put(pdev);
+
+ return rc;
}
device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cd138bfd926c..e8d40a5979ec 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -434,7 +434,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
}
if (updmsr)
- wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+ update_spec_ctrl_cond(msr);
}
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 736348ead421..2ecf1dcc86b2 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -96,7 +96,7 @@ static void ich_force_hpet_resume(void)
static void ich_force_enable_hpet(struct pci_dev *dev)
{
u32 val;
- u32 uninitialized_var(rcba);
+ u32 rcba;
int err = 0;
if (hpet_address || force_hpet_address)
@@ -186,7 +186,7 @@ static void hpet_print_force_info(void)
static void old_ich_force_hpet_resume(void)
{
u32 val;
- u32 uninitialized_var(gen_cntl);
+ u32 gen_cntl;
if (!force_hpet_address || !cached_dev)
return;
@@ -208,7 +208,7 @@ static void old_ich_force_hpet_resume(void)
static void old_ich_force_enable_hpet(struct pci_dev *dev)
{
u32 val;
- u32 uninitialized_var(gen_cntl);
+ u32 gen_cntl;
if (hpet_address || force_hpet_address)
return;
@@ -299,7 +299,7 @@ static void vt8237_force_hpet_resume(void)
static void vt8237_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 val;
if (hpet_address || force_hpet_address)
return;
@@ -430,7 +430,7 @@ static void nvidia_force_hpet_resume(void)
static void nvidia_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 val;
if (hpet_address || force_hpet_address)
return;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index b0f3a996df15..444b8a691c14 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -536,33 +536,29 @@ static inline void kb_wait(void)
}
}
-static void vmxoff_nmi(int cpu, struct pt_regs *regs)
-{
- cpu_emergency_vmxoff();
-}
+static inline void nmi_shootdown_cpus_on_restart(void);
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
-static void emergency_vmx_disable_all(void)
+static void emergency_reboot_disable_virtualization(void)
{
/* Just make sure we won't change CPUs while doing this */
local_irq_disable();
/*
- * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
- * the machine, because the CPU blocks INIT when it's in VMX root.
+ * Disable virtualization on all CPUs before rebooting to avoid hanging
+ * the system, as VMX and SVM block INIT when running in the host.
*
* We can't take any locks and we may be in an inconsistent state, so
- * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
+ * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt.
*
- * Do the NMI shootdown even if VMX if off on _this_ CPU, as that
- * doesn't prevent a different CPU from being in VMX root operation.
+ * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
+ * other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx()) {
- /* Safely force _this_ CPU out of VMX root operation. */
- __cpu_emergency_vmxoff();
+ if (cpu_has_vmx() || cpu_has_svm(NULL)) {
+ /* Safely force _this_ CPU out of VMX/SVM operation. */
+ cpu_emergency_disable_virtualization();
- /* Halt and exit VMX root operation on the other CPUs. */
- nmi_shootdown_cpus(vmxoff_nmi);
+ /* Disable VMX/SVM and halt on other CPUs. */
+ nmi_shootdown_cpus_on_restart();
}
}
@@ -599,7 +595,7 @@ static void native_machine_emergency_restart(void)
unsigned short mode;
if (reboot_emergency)
- emergency_vmx_disable_all();
+ emergency_reboot_disable_virtualization();
tboot_shutdown(TB_SHUTDOWN_REBOOT);
@@ -804,6 +800,17 @@ void machine_crash_shutdown(struct pt_regs *regs)
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+}
+
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
@@ -826,7 +833,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, regs);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -842,18 +856,32 @@ static void smp_send_nmi_allbutself(void)
apic->send_IPI_allbutself(NMI_VECTOR);
}
-/*
- * Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
+ *
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
@@ -881,7 +909,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
}
/*
@@ -911,6 +949,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* No other CPUs to shoot down */
}
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
void run_crash_ipi_callback(struct pt_regs *regs)
{
}
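The reboot.c rework makes the NMI shootdown strictly one-shot: a second request warns and does nothing, and the restart path only issues it if a crash has not already done so. Control flow only, with no real NMIs and warn_once() standing in for WARN_ON_ONCE():

#include <stdbool.h>
#include <stdio.h>

static bool crash_ipi_issued;

static void warn_once(const char *what) { fprintf(stderr, "WARN: %s\n", what); }

static void nmi_shootdown_cpus(void (*callback)(int cpu))
{
	if (crash_ipi_issued) {
		warn_once("shootdown requested twice");
		return;
	}
	crash_ipi_issued = true;
	if (callback)			/* callback is now optional (NULL on reboot) */
		callback(0);
	/* ...send NMIs, wait for other CPUs, leave the handler registered... */
}

static void nmi_shootdown_cpus_on_restart(void)
{
	if (!crash_ipi_issued)		/* don't redo it after a crash shootdown */
		nmi_shootdown_cpus(NULL);
}

int main(void)
{
	nmi_shootdown_cpus_on_restart();	/* runs the shootdown once */
	nmi_shootdown_cpus_on_restart();	/* second call is a no-op */
	nmi_shootdown_cpus(NULL);		/* direct second call warns */
	return 0;
}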
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index b2b87b91f336..c94ed0b37bcd 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -33,7 +33,7 @@
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
#include <asm/kexec.h>
-#include <asm/virtext.h>
+#include <asm/reboot.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -163,7 +163,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
return NMI_HANDLED;
- cpu_emergency_vmxoff();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
return NMI_HANDLED;
@@ -176,7 +176,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
asmlinkage __visible void smp_reboot_interrupt(void)
{
ipi_entering_ack_irq();
- cpu_emergency_vmxoff();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
irq_exit();
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8783d065f927..2e4f6a1ebca5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -96,6 +96,17 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
+struct mwait_cpu_dead {
+ unsigned int control;
+ unsigned int status;
+};
+
+/*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
+
/* Logical package management. We might want to allocate that dynamically */
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
@@ -231,6 +242,7 @@ static void notrace start_secondary(void *unused)
#endif
load_current_idt();
cpu_init();
+ fpu__init_cpu();
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
smp_callin();
@@ -1594,10 +1606,10 @@ static bool wakeup_cpu0(void)
*/
static inline void mwait_play_dead(void)
{
+ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
unsigned int eax, ebx, ecx, edx;
unsigned int highest_cstate = 0;
unsigned int highest_subcstate = 0;
- void *mwait_ptr;
int i;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
@@ -1631,13 +1643,6 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
- /*
- * This should be a memory location in a cache line which is
- * unlikely to be touched by other processors. The actual
- * content is immaterial as it is not actually modified in any way.
- */
- mwait_ptr = &current_thread_info()->flags;
-
wbinvd();
while (1) {
@@ -1649,9 +1654,9 @@ static inline void mwait_play_dead(void)
* case where we return around the loop.
*/
mb();
- clflush(mwait_ptr);
+ clflush(md);
mb();
- __monitor(mwait_ptr, 0, 0);
+ __monitor(md, 0, 0);
mb();
__mwait(eax, 0);
/*
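The smpboot.c change gives mwait_play_dead() a per-CPU, cache-line-aligned monitor target instead of current_thread_info()->flags, which other CPUs legitimately write to and which therefore keeps waking the monitor. The shape of that data, using a GCC alignment attribute and an assumed 64-byte line size (the kernel expresses this with DEFINE_PER_CPU_ALIGNED instead):

#include <stdio.h>

#define CACHE_LINE 64	/* assumed line size */
#define NR_CPUS 8	/* illustrative only */

/* Aligning the struct to the line size also pads its sizeof up to a full
 * line, so each array element gets a line of its own. */
struct mwait_cpu_dead {
	unsigned int control;
	unsigned int status;
} __attribute__((aligned(CACHE_LINE)));

static struct mwait_cpu_dead mwait_cpu_dead[NR_CPUS];

int main(void)
{
	printf("per-CPU stride: %zu bytes\n", sizeof(struct mwait_cpu_dead));
	printf("entries: %zu\n", sizeof(mwait_cpu_dead) / sizeof(mwait_cpu_dead[0]));
	return 0;
}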
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 60d2c3798ba2..2f97d1a1032f 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -175,8 +175,7 @@ void set_task_blockstep(struct task_struct *task, bool on)
*
* NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
* task is current or it can't be running, otherwise we can race
- * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but
- * PTRACE_KILL is not safe.
+ * with __switch_to_xtra(). We rely on ptrace_freeze_traced().
*/
local_irq_disable();
debugctl = get_debugctlmsr();
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6a78d4b36a79..9fb266c797fb 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -70,9 +70,6 @@ static int __init control_va_addr_alignment(char *str)
if (*str == 0)
return 1;
- if (*str == '=')
- str++;
-
if (!strcmp(str, "32"))
va_align.flags = ALIGN_VA_32;
else if (!strcmp(str, "64"))
@@ -82,11 +79,11 @@ static int __init control_va_addr_alignment(char *str)
else if (!strcmp(str, "on"))
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
else
- return 0;
+ pr_warn("invalid option value: 'align_va_addr=%s'\n", str);
return 1;
}
-__setup("align_va_addr", control_va_addr_alignment);
+__setup("align_va_addr=", control_va_addr_alignment);
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
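
The trailing '=' in the __setup() string matters: with "align_va_addr=" the early-parameter code hands the handler only the value, so the manual '=' skip above becomes dead code, and warning instead of returning 0 keeps an unrecognized value from being passed on to init. A minimal sketch of the same pattern for a hypothetical boot parameter:

	static bool myflag_enabled __initdata;		/* hypothetical flag */

	static int __init parse_myflag(char *str)	/* hypothetical handler */
	{
		if (!str || !*str)
			return 1;

		if (!strcmp(str, "on"))
			myflag_enabled = true;
		else if (!strcmp(str, "off"))
			myflag_enabled = false;
		else
			pr_warn("invalid option value: 'myflag=%s'\n", str);

		return 1;	/* always claim the option so it is not passed to init */
	}
	__setup("myflag=", parse_myflag);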
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
index 897da526e40e..5bc0fedb3342 100644
--- a/arch/x86/kernel/sysfb_efi.c
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -265,6 +265,22 @@ static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = {
"Lenovo ideapad D330-10IGM"),
},
},
+ {
+ /* Lenovo IdeaPad Duet 3 10IGL5 with 1200x1920 portrait screen */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
+ "IdeaPad Duet 3 10IGL5"),
+ },
+ },
+ {
+ /* Lenovo Yoga Book X91F / X91L */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ /* Non exact match to match F + L versions */
+ DMI_MATCH(DMI_PRODUCT_NAME, "Lenovo YB1-X91"),
+ },
+ },
{},
};
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 4f17c1c94949..0c0f0eda327d 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -89,22 +89,27 @@ static struct orc_entry *orc_find(unsigned long ip);
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
struct ftrace_ops *ops;
- unsigned long caller;
+ unsigned long tramp_addr, offset;
ops = ftrace_ops_trampoline(ip);
if (!ops)
return NULL;
+ /* Set tramp_addr to the start of the code copied by the trampoline */
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
- caller = (unsigned long)ftrace_regs_call;
+ tramp_addr = (unsigned long)ftrace_regs_caller;
else
- caller = (unsigned long)ftrace_call;
+ tramp_addr = (unsigned long)ftrace_caller;
+
+	/* Now adjust tramp_addr to the location within the trampoline that ip is at */
+ offset = ip - ops->trampoline;
+ tramp_addr += offset;
/* Prevent unlikely recursion */
- if (ip == caller)
+ if (ip == tramp_addr)
return NULL;
- return orc_find(caller);
+ return orc_find(tramp_addr);
}
#else
static struct orc_entry *orc_ftrace_find(unsigned long ip)
@@ -663,7 +668,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
/* Otherwise, skip ahead to the user-specified starting frame: */
while (!unwind_done(state) &&
(!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
- state->sp < (unsigned long)first_frame))
+ state->sp <= (unsigned long)first_frame))
unwind_next_frame(state);
return;
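
The first hunk maps an ip inside a dynamically allocated ftrace trampoline back to the same offset inside the original ftrace_caller()/ftrace_regs_caller(), for which ORC data actually exists, instead of always looking up the single call site. As a worked example with made-up addresses: if ops->trampoline is 0xffffffffa0002000, the unwinder's ip is 0xffffffffa0002054 and ftrace_caller starts at 0xffffffff81060000, then offset is 0x54 and orc_find() is called with 0xffffffff81060054, i.e. the same offset into the code the trampoline was copied from.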
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index ae9e806a11de..3b87bca027b4 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -735,8 +735,9 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
switch (opc1) {
case 0xeb: /* jmp 8 */
case 0xe9: /* jmp 32 */
- case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
break;
+ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
+ goto setup;
case 0xe8: /* call relative */
branch_clear_offset(auprobe, insn);
@@ -765,6 +766,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
return -ENOTSUPP;
}
+setup:
auprobe->branch.opc1 = opc1;
auprobe->branch.ilen = insn->length;
auprobe->branch.offs = insn->immediate.value;
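
Routing 0x90 straight to the new setup label skips the checks that sit between the switch and that label (including, judging by the second hunk, an -ENOTSUPP rejection), which a plain or prefixed nop does not need. Conceptually the emulation amounts to a jump with zero displacement; a sketch of the resulting state, not taken verbatim from the diff:

	/* For a nop, the emulated "branch" is simply a jump to the next insn: */
	auprobe->branch.opc1 = 0x90;
	auprobe->branch.ilen = insn->length;	/* e.g. 2 for a prefixed "66 90" */
	auprobe->branch.offs = 0;		/* nop behaves like jmp +0 */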
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 34c0652ca8b4..20d09355c9e0 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -383,7 +383,7 @@ SECTIONS
.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
__brk_base = .;
. += 64 * 1024; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
+ *(.bss..brk) /* areas brk users have reserved */
__brk_limit = .;
}
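
For context, brk reservations are declared with the RESERVE_BRK() helper; with this rename they land in a .bss..brk input section (uninitialized data, so they should not take space in the on-disk image) and are collected here between __brk_base and __brk_limit. A sketch of how such a reservation is typically declared and consumed; the name and sizes are illustrative:

	/* Reserve 64K of early boot "brk" memory, e.g. for DMI scanning: */
	RESERVE_BRK(dmi_alloc, 65536);

	/* Early boot code later carves allocations out of the reservation: */
	void *buf = extend_brk(4096, 16);	/* size, alignment */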