diff options
Diffstat (limited to 'arch/x86/kernel')
50 files changed, 1685 insertions, 654 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7b75658b7e9a..a5e41e66fcbd 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -140,6 +140,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) madt->address); } + if (madt->flags & ACPI_MADT_PCAT_COMPAT) + legacy_pic_pcat_compat(); + default_acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9353bac92480..421facbeda91 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -374,6 +374,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 insn_buff[MAX_PATCH_LEN]; DPRINTK("alt table %px, -> %px", start, end); + + /* + * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. + * During the process, KASAN becomes confused seeing partial LA57 + * conversion and triggers a false-positive out-of-bound report. + * + * Disable KASAN until the patching is complete. + */ + kasan_disable_current(); + /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -434,6 +445,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); } + + kasan_enable_current(); } #ifdef CONFIG_SMP @@ -772,8 +785,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } else { local_irq_save(flags); memcpy(addr, opcode, len); - local_irq_restore(flags); sync_core(); + local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4e4476b832be..a3b7b2fb04cb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -168,7 +168,7 @@ static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return 0; + return 1; } __setup("apicpmtimer", setup_apicpmtimer); #endif @@ -410,10 +410,9 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new) if (vector && !eilvt_entry_is_changeable(vector, new)) /* may not change if vectors are different */ return rsvd; - rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); - } while (rsvd != new); + } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new)); - rsvd &= ~APIC_EILVT_MASKED; + rsvd = new & ~APIC_EILVT_MASKED; if (rsvd && rsvd != vector) pr_info("LVT offset %d assigned for vector 0x%02x\n", offset, rsvd); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1622cff009c9..e4d63392c619 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2455,17 +2455,21 @@ static int io_apic_get_redir_entries(int ioapic) unsigned int arch_dynirq_lower_bound(unsigned int from) { + unsigned int ret; + /* * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use * gsi_top if ioapic_dynirq_base hasn't been initialized yet. */ - if (!ioapic_initialized) - return gsi_top; + ret = ioapic_dynirq_base ? : gsi_top; + /* - * For DT enabled machines ioapic_dynirq_base is irrelevant and not - * updated. So simply return @from if ioapic_dynirq_base == 0. + * For DT enabled machines ioapic_dynirq_base is irrelevant and + * always 0. gsi_top can be 0 if there is no IO/APIC registered. + * 0 is an invalid interrupt number for dynamic allocations. Return + * @from instead. */ - return ioapic_dynirq_base ? : from; + return ret ? : from; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 032a00e5d9fa..76c80e191a1b 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -97,7 +97,10 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) return 1; return apic == &apic_x2apic_phys; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 660270359d39..166d9991e711 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -238,12 +238,6 @@ extern int (*console_blank_hook)(int); #endif /* - * The apm_bios device is one of the misc char devices. - * This is its minor number. - */ -#define APM_MINOR_DEV 134 - -/* * Various options can be changed at boot time as follows: * (We allow underscores for compatibility with the modules code) * apm=on/off enable/disable APM diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 88cef978380b..533451498c8f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -26,11 +26,6 @@ #include "cpu.h" -static const int amd_erratum_383[]; -static const int amd_erratum_400[]; -static const int amd_erratum_1054[]; -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); - /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX @@ -38,6 +33,83 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); */ static u32 nodes_per_socket = 1; +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +static const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + +static const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); + +/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ +static const int amd_erratum_1054[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); + +static const int amd_zenbleed[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), + AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), + AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), + AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); + +static const int amd_erratum_1485[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), + AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) +{ + int osvw_id = *erratum++; + u32 range; + u32 ms; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 4) | cpu->x86_stepping; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -587,7 +659,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * If BIOS has not enabled SME then don't advertise the * SME feature (set in scattered.c). * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV feature (set in scattered.c). + * SEV and SEV_ES feature (set in scattered.c). * * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. @@ -618,6 +690,7 @@ clear_all: setup_clear_cpu_cap(X86_FEATURE_SME); clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV_ES); } } @@ -794,8 +867,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } -#define MSR_AMD64_DE_CFG 0xC0011029 - static void init_amd_ln(struct cpuinfo_x86 *c) { /* @@ -894,12 +965,73 @@ static void init_amd_zn(struct cpuinfo_x86 *c) node_reclaim_distance = 32; #endif + /* Fix up CPUID bits, but only if not virtualised. */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + + /* Erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) + set_cpu_cap(c, X86_FEATURE_CPB); + + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } + /* - * Fix erratum 1076: CPB feature bit not being set in CPUID. - * Always set it, except when running under a hypervisor. + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) - set_cpu_cap(c, X86_FEATURE_CPB); + if (c->x86 == 0x17) + clear_cpu_cap(c, X86_FEATURE_XSAVES); +} + +static bool cpu_has_zenbleed_microcode(void) +{ + u32 good_rev = 0; + + switch (boot_cpu_data.x86_model) { + case 0x30 ... 0x3f: good_rev = 0x0830107b; break; + case 0x60 ... 0x67: good_rev = 0x0860010c; break; + case 0x68 ... 0x6f: good_rev = 0x08608107; break; + case 0x70 ... 0x7f: good_rev = 0x08701033; break; + case 0xa0 ... 0xaf: good_rev = 0x08a00009; break; + + default: + return false; + break; + } + + if (boot_cpu_data.microcode < good_rev) + return false; + + return true; +} + +static void zenbleed_check(struct cpuinfo_x86 *c) +{ + if (!cpu_has_amd_erratum(c, amd_zenbleed)) + return; + + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return; + + if (!cpu_has(c, X86_FEATURE_AVX)) + return; + + if (!cpu_has_zenbleed_microcode()) { + pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n"); + msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } else { + msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } } static void init_amd(struct cpuinfo_x86 *c) @@ -956,8 +1088,8 @@ static void init_amd(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); @@ -989,6 +1121,12 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); + + zenbleed_check(c); + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + cpu_has_amd_erratum(c, amd_erratum_1485)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); } #ifdef CONFIG_X86_32 @@ -1084,73 +1222,6 @@ static const struct cpu_dev amd_cpu_dev = { cpu_dev_register(amd_cpu_dev); -/* - * AMD errata checking - * - * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or - * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that - * have an OSVW id assigned, which it takes as first argument. Both take a - * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). - * - * Example: - * - * const int amd_erratum_319[] = - * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), - * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), - * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); - */ - -#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } -#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } -#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ - ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) -#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) -#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) -#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) - -static const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), - AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); - -static const int amd_erratum_383[] = - AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); - -/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ -static const int amd_erratum_1054[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) -{ - int osvw_id = *erratum++; - u32 range; - u32 ms; - - if (osvw_id >= 0 && osvw_id < 65536 && - cpu_has(cpu, X86_FEATURE_OSVW)) { - u64 osvw_len; - - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); - if (osvw_id < osvw_len) { - u64 osvw_bits; - - rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), - osvw_bits); - return osvw_bits & (1ULL << (osvw_id & 0x3f)); - } - } - - /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_stepping; - while ((range = *erratum++)) - if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && - (ms >= AMD_MODEL_RANGE_START(range)) && - (ms <= AMD_MODEL_RANGE_END(range))) - return true; - - return false; -} - void set_dr_addr_mask(unsigned long mask, int dr) { if (!boot_cpu_has(X86_FEATURE_BPEXT)) @@ -1169,3 +1240,18 @@ void set_dr_addr_mask(unsigned long mask, int dr) break; } } + +static void zenbleed_check_cpu(void *unused) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + zenbleed_check(c); +} + +void amd_check_microcode(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + + on_each_cpu(zenbleed_check_cpu, NULL, 1); +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e817aaeef254..af748d1c78d4 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -9,7 +9,6 @@ * - Andrew D. Balsa (code cleanup). */ #include <linux/init.h> -#include <linux/utsname.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/nospec.h> @@ -25,9 +24,7 @@ #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> -#include <asm/alternative.h> #include <asm/pgtable.h> -#include <asm/set_memory.h> #include <asm/intel-family.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> @@ -37,23 +34,59 @@ static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init retbleed_select_mitigation(void); +static void __init spectre_v2_user_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); -static void __init mds_print_mitigation(void); +static void __init md_clear_update_mitigation(void); +static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); +static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init gds_select_mitigation(void); -/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); + static DEFINE_MUTEX(spec_ctrl_mutex); +/* Update SPEC_CTRL MSR and its cached copy unconditionally */ +static void update_spec_ctrl(u64 val) +{ + this_cpu_write(x86_spec_ctrl_current, val); + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + /* - * The vendor and possibly platform specific bits which can be modified in - * x86_spec_ctrl_base. + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; +void update_spec_ctrl_cond(u64 val) +{ + if (this_cpu_read(x86_spec_ctrl_current) == val) + return; + + this_cpu_write(x86_spec_ctrl_current, val); + + /* + * When KERNEL_IBRS this MSR is written on return-to-user, unless + * forced the update can be delayed until that time. + */ + if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +u64 spec_ctrl_current(void) +{ + return this_cpu_read(x86_spec_ctrl_current); +} +EXPORT_SYMBOL_GPL(spec_ctrl_current); /* * AMD specific MSR info for Speculative Store Bypass control. @@ -76,107 +109,61 @@ EXPORT_SYMBOL_GPL(mds_user_clear); DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); -void __init check_bugs(void) -{ - identify_boot_cpu(); - - /* - * identify_boot_cpu() initialized SMT support information, let the - * core code know. - */ - cpu_smt_check_topology(); - - if (!IS_ENABLED(CONFIG_SMP)) { - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); - } +/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ +DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); +EXPORT_SYMBOL_GPL(mmio_stale_data_clear); +void __init cpu_select_mitigations(void) +{ /* * Read the SPEC_CTRL MSR to account for reserved bits which may * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD * init code as it is not enumerated and depends on the family. */ - if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - /* Allow STIBP in MSR_SPEC_CTRL if supported */ - if (boot_cpu_has(X86_FEATURE_STIBP)) - x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. + */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - ssb_select_mitigation(); - l1tf_select_mitigation(); - mds_select_mitigation(); - taa_select_mitigation(); - srbds_select_mitigation(); - - /* - * As MDS and TAA mitigations are inter-related, print MDS - * mitigation until after TAA mitigation selection is done. - */ - mds_print_mitigation(); - - arch_smt_update(); - -#ifdef CONFIG_X86_32 /* - * Check whether we are able to run this kernel safely on SMP. - * - * - i386 is no longer supported. - * - In order to run on anything without a TSC, we need to be - * compiled for a i486. + * retbleed_select_mitigation() relies on the state set by + * spectre_v2_select_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. */ - if (boot_cpu_data.x86 < 4) - panic("Kernel requires i486+ for 'invlpg' and other features"); - - init_utsname()->machine[1] = - '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); - alternative_instructions(); - - fpu__init_check_bugs(); -#else /* CONFIG_X86_64 */ - alternative_instructions(); - + retbleed_select_mitigation(); /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. + * spectre_v2_user_select_mitigation() relies on the state set by + * retbleed_select_mitigation(); specifically the STIBP selection is + * forced for UNRET. */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -#endif + spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + md_clear_select_mitigation(); + srbds_select_mitigation(); + gds_select_mitigation(); } +/* + * NOTE: For VMX, this function is not called in the vmexit path. + * It uses vmx_spec_ctrl_restore_host() instead. + */ void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) { - u64 msrval, guestval, hostval = x86_spec_ctrl_base; + u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); struct thread_info *ti = current_thread_info(); - /* Is MSR_SPEC_CTRL implemented ? */ if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - /* - * Restrict guest_spec_ctrl to supported values. Clear the - * modifiable bits in the host base value and or the - * modifiable bits from the guest value. - */ - guestval = hostval & ~x86_spec_ctrl_mask; - guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; - - /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) - hostval |= ssbd_tif_to_spec_ctrl(ti->flags); - - /* Conditional STIBP enabled? */ - if (static_branch_unlikely(&switch_to_cond_stibp)) - hostval |= stibp_tif_to_spec_ctrl(ti->flags); - if (hostval != guestval) { msrval = setguest ? guestval : hostval; wrmsrl(MSR_IA32_SPEC_CTRL, msrval); @@ -257,14 +244,6 @@ static void __init mds_select_mitigation(void) } } -static void __init mds_print_mitigation(void) -{ - if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) - return; - - pr_info("%s\n", mds_strings[mds_mitigation]); -} - static int __init mds_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MDS)) @@ -312,7 +291,7 @@ static void __init taa_select_mitigation(void) /* TSX previously disabled by tsx=off */ if (!boot_cpu_has(X86_FEATURE_RTM)) { taa_mitigation = TAA_MITIGATION_TSX_DISABLED; - goto out; + return; } if (cpu_mitigations_off()) { @@ -326,7 +305,7 @@ static void __init taa_select_mitigation(void) */ if (taa_mitigation == TAA_MITIGATION_OFF && mds_mitigation == MDS_MITIGATION_OFF) - goto out; + return; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) taa_mitigation = TAA_MITIGATION_VERW; @@ -358,18 +337,6 @@ static void __init taa_select_mitigation(void) if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); - - /* - * Update MDS mitigation, if necessary, as the mds_user_clear is - * now enabled for TAA mitigation. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } -out: - pr_info("%s\n", taa_strings[taa_mitigation]); } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -394,6 +361,154 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "MMIO Stale Data: " fmt + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static bool mmio_nosmt __ro_after_init = false; + +static const char * const mmio_strings[] = { + [MMIO_MITIGATION_OFF] = "Vulnerable", + [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", +}; + +static void __init mmio_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || + boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || + cpu_mitigations_off()) { + mmio_mitigation = MMIO_MITIGATION_OFF; + return; + } + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable CPU buffer clear mitigation for host and VMM, if also affected + * by MDS or TAA. Otherwise, enable mitigation for VMM only. + */ + if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && + boot_cpu_has(X86_FEATURE_RTM))) + static_branch_enable(&mds_user_clear); + else + static_branch_enable(&mmio_stale_data_clear); + + /* + * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can + * be propagated to uncore buffers, clearing the Fill buffers on idle + * is required irrespective of SMT state. + */ + if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + static_branch_enable(&mds_idle_clear); + + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(ia32_cap & ARCH_CAP_MDS_NO))) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + + if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init mmio_stale_data_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + mmio_mitigation = MMIO_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_nosmt = true; + } + + return 0; +} +early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static void __init md_clear_update_mitigation(void) +{ + if (cpu_mitigations_off()) + return; + + if (!static_key_enabled(&mds_user_clear)) + goto out; + + /* + * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data + * mitigation, if necessary. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } + if (taa_mitigation == TAA_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_select_mitigation(); + } + if (mmio_mitigation == MMIO_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_select_mitigation(); + } +out: + if (boot_cpu_has_bug(X86_BUG_MDS)) + pr_info("MDS: %s\n", mds_strings[mds_mitigation]); + if (boot_cpu_has_bug(X86_BUG_TAA)) + pr_info("TAA: %s\n", taa_strings[taa_mitigation]); + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); + else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + pr_info("MMIO Stale Data: Unknown: No mitigations\n"); +} + +static void __init md_clear_select_mitigation(void) +{ + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + + /* + * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update + * and print their mitigation after MDS, TAA and MMIO Stale Data + * mitigation selection is done. + */ + md_clear_update_mitigation(); +} + +#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { @@ -454,11 +569,13 @@ static void __init srbds_select_mitigation(void) return; /* - * Check to see if this is one of the MDS_NO systems supporting - * TSX that are only exposed to SRBDS when TSX is enabled. + * Check to see if this is one of the MDS_NO systems supporting TSX that + * are only exposed to SRBDS when TSX is enabled or when CPU is affected + * by Processor MMIO Stale Data vulnerability. */ ia32_cap = x86_read_arch_cap_msr(); - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; @@ -485,6 +602,149 @@ static int __init srbds_parse_cmdline(char *str) early_param("srbds", srbds_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "GDS: " fmt + +enum gds_mitigations { + GDS_MITIGATION_OFF, + GDS_MITIGATION_UCODE_NEEDED, + GDS_MITIGATION_FORCE, + GDS_MITIGATION_FULL, + GDS_MITIGATION_FULL_LOCKED, + GDS_MITIGATION_HYPERVISOR, +}; + +#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; +#else +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; +#endif + +static const char * const gds_strings[] = { + [GDS_MITIGATION_OFF] = "Vulnerable", + [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode", + [GDS_MITIGATION_FULL] = "Mitigation: Microcode", + [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", + [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +bool gds_ucode_mitigated(void) +{ + return (gds_mitigation == GDS_MITIGATION_FULL || + gds_mitigation == GDS_MITIGATION_FULL_LOCKED); +} +EXPORT_SYMBOL_GPL(gds_ucode_mitigated); + +void update_gds_msr(void) +{ + u64 mcu_ctrl_after; + u64 mcu_ctrl; + + switch (gds_mitigation) { + case GDS_MITIGATION_OFF: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl |= GDS_MITG_DIS; + break; + case GDS_MITIGATION_FULL_LOCKED: + /* + * The LOCKED state comes from the boot CPU. APs might not have + * the same state. Make sure the mitigation is enabled on all + * CPUs. + */ + case GDS_MITIGATION_FULL: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl &= ~GDS_MITG_DIS; + break; + case GDS_MITIGATION_FORCE: + case GDS_MITIGATION_UCODE_NEEDED: + case GDS_MITIGATION_HYPERVISOR: + return; + }; + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + /* + * Check to make sure that the WRMSR value was not ignored. Writes to + * GDS_MITG_DIS will be ignored if this processor is locked but the boot + * processor was not. + */ + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); +} + +static void __init gds_select_mitigation(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + gds_mitigation = GDS_MITIGATION_HYPERVISOR; + goto out; + } + + if (cpu_mitigations_off()) + gds_mitigation = GDS_MITIGATION_OFF; + /* Will verify below that mitigation _can_ be disabled */ + + /* No microcode */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } else { + gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; + } + goto out; + } + + /* Microcode has mitigation, use it */ + if (gds_mitigation == GDS_MITIGATION_FORCE) + gds_mitigation = GDS_MITIGATION_FULL; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + if (mcu_ctrl & GDS_MITG_LOCKED) { + if (gds_mitigation == GDS_MITIGATION_OFF) + pr_warn("Mitigation locked. Disable failed.\n"); + + /* + * The mitigation is selected from the boot CPU. All other CPUs + * _should_ have the same state. If the boot CPU isn't locked + * but others are then update_gds_msr() will WARN() of the state + * mismatch. If the boot CPU is locked update_gds_msr() will + * ensure the other CPUs have the mitigation enabled. + */ + gds_mitigation = GDS_MITIGATION_FULL_LOCKED; + } + + update_gds_msr(); +out: + pr_info("%s\n", gds_strings[gds_mitigation]); +} + +static int __init gds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return 0; + + if (!strcmp(str, "off")) + gds_mitigation = GDS_MITIGATION_OFF; + else if (!strcmp(str, "force")) + gds_mitigation = GDS_MITIGATION_FORCE; + + return 0; +} +early_param("gather_data_sampling", gds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -576,12 +836,103 @@ static int __init nospectre_v1_cmdline(char *str) } early_param("nospectre_v1", nospectre_v1_cmdline); -#undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 : " fmt - static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +#undef pr_fmt +#define pr_fmt(fmt) "RETBleed: " fmt + +enum retbleed_mitigation { + RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_IBRS, + RETBLEED_MITIGATION_EIBRS, +}; + +enum retbleed_mitigation_cmd { + RETBLEED_CMD_OFF, + RETBLEED_CMD_AUTO, +}; + +const char * const retbleed_strings[] = { + [RETBLEED_MITIGATION_NONE] = "Vulnerable", + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", +}; + +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = + RETBLEED_MITIGATION_NONE; +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = + RETBLEED_CMD_AUTO; + +static int __init retbleed_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + retbleed_cmd = RETBLEED_CMD_OFF; + else if (!strcmp(str, "auto")) + retbleed_cmd = RETBLEED_CMD_AUTO; + else + pr_err("Unknown retbleed option (%s). Defaulting to 'auto'\n", str); + + return 0; +} +early_param("retbleed", retbleed_parse_cmdline); + +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" +#define RETBLEED_COMPILER_MSG "WARNING: kernel not compiled with RETPOLINE or -mfunction-return capable compiler!\n" +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" + +static void __init retbleed_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + switch (retbleed_cmd) { + case RETBLEED_CMD_OFF: + return; + + case RETBLEED_CMD_AUTO: + default: + /* + * The Intel mitigation (IBRS) was already selected in + * spectre_v2_select_mitigation(). + */ + + break; + } + + switch (retbleed_mitigation) { + default: + break; + } + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + pr_err(RETBLEED_INTEL_MSG); + } + } + + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = SPECTRE_V2_USER_NONE; static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = @@ -611,6 +962,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; } #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) @@ -652,6 +1004,7 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_EIBRS, SPECTRE_V2_CMD_EIBRS_RETPOLINE, SPECTRE_V2_CMD_EIBRS_LFENCE, + SPECTRE_V2_CMD_IBRS, }; enum spectre_v2_user_cmd { @@ -692,13 +1045,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; + static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; - switch (v2_cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; case SPECTRE_V2_CMD_FORCE: @@ -726,13 +1081,18 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) { - return (mode == SPECTRE_V2_EIBRS || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_EIBRS_LFENCE); + return mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; +} + +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) +{ + return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } static void __init -spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); @@ -745,7 +1105,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; - cmd = spectre_v2_parse_user_cmdline(v2_cmd); + cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; @@ -793,12 +1153,21 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not - * required. + * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP + * is not required. + * + * Intel's Enhanced IBRS also protects against cross-thread branch target + * injection in user-mode as the IBRS bit remains always set which + * implicitly enables cross-thread protections. However, in legacy IBRS + * mode, the IBRS bit is set only on kernel entry and cleared on return + * to userspace. AMD Automatic IBRS also does not protect userspace. + * These modes therefore disable the implicit cross-thread protection, + * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) return; /* @@ -820,9 +1189,10 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", - [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", - [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", - [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; static const struct { @@ -840,6 +1210,7 @@ static const struct { { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, }; static void __init spec_v2_print_cond(const char *reason, bool secure) @@ -889,7 +1260,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -902,6 +1273,24 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; @@ -917,6 +1306,69 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + } +} + +static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +{ + /* + * Similar to context switches, there are two types of RSB attacks + * after VM exit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB + * bug is present then a LITE version of RSB protection is required, + * just a single call needs to retire before a RET is executed. + */ + switch (mode) { + case SPECTRE_V2_NONE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS: + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; + + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); + dump_stack(); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -941,6 +1393,14 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_cmd != RETBLEED_CMD_OFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + mode = SPECTRE_V2_IBRS; + break; + } + mode = spectre_v2_select_retpoline(); break; @@ -957,6 +1417,10 @@ static void __init spectre_v2_select_mitigation(void) mode = spectre_v2_select_retpoline(); break; + case SPECTRE_V2_CMD_IBRS: + mode = SPECTRE_V2_IBRS; + break; + case SPECTRE_V2_CMD_EIBRS: mode = SPECTRE_V2_EIBRS; break; @@ -973,10 +1437,13 @@ static void __init spectre_v2_select_mitigation(void) if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_eibrs_mode(mode)) { - /* Force it so VMEXIT will restore correctly */ - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + if (spectre_v2_in_ibrs_mode(mode)) { + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { + msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + update_spec_ctrl(x86_spec_ctrl_base); + } } switch (mode) { @@ -984,6 +1451,12 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_EIBRS: break; + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); + break; + case SPECTRE_V2_LFENCE: case SPECTRE_V2_EIBRS_LFENCE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); @@ -995,43 +1468,86 @@ static void __init spectre_v2_select_mitigation(void) break; } + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. + */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* - * If spectre v2 protection has been enabled, unconditionally fill - * RSB during a context switch; this protects against two independent - * issues: + * If Spectre v2 protection has been enabled, fill the RSB during a + * context switch. In general there are two types of RSB attacks + * across context switches, for which the CALLs/RETs may be unbalanced. + * + * 1) RSB underflow + * + * Some Intel parts have "bottomless RSB". When the RSB is empty, + * speculated return targets may come from the branch predictor, + * which could have a user-poisoned BTB or BHB entry. + * + * AMD has it even worse: *all* returns are speculated from the BTB, + * regardless of the state of the RSB. + * + * When IBRS or eIBRS is enabled, the "user -> kernel" attack + * scenario is mitigated by the IBRS branch prediction isolation + * properties, so the RSB buffer filling wouldn't be necessary to + * protect against this type of attack. + * + * The "user -> user" attack scenario is mitigated by RSB filling. + * + * 2) Poisoned RSB entry + * + * If the 'next' in-kernel return stack is shorter than 'prev', + * 'next' could be tricked into speculating with a user-poisoned RSB + * entry. + * + * The "user -> kernel" attack scenario is mitigated by SMEP and + * eIBRS. + * + * The "user -> user" scenario, also known as SpectreBHB, requires + * RSB clearing. * - * - RSB underflow (and switch to BTB) on Skylake+ - * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs + * So to mitigate all cases, unconditionally fill RSB on context + * switches. + * + * FIXME: Is this pointless for retbleed-affected AMD? */ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); + spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + /* - * Retpoline means the kernel is safe because it has no indirect - * branches. Enhanced IBRS protects firmware too, so, enable restricted - * speculation around firmware calls only when Enhanced IBRS isn't - * supported. + * Retpoline protects the kernel, but doesn't protect firmware. IBRS + * and Enhanced IBRS protect firmware too, so enable IBRS around + * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't + * otherwise enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { + if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_user_select_mitigation(cmd); + spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) { - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); + update_spec_ctrl(val); } /* Update x86_spec_ctrl_base in case SMT state changed. */ @@ -1066,6 +1582,8 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { + u64 ia32_cap = x86_read_arch_cap_msr(); + /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -1077,14 +1595,17 @@ static void update_mds_branch_idle(void) if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) return; - if (sched_smt_active()) + if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); - else + } else if (mmio_mitigation == MMIO_MITIGATION_OFF || + (ia32_cap & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); + } } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" #define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" void cpu_bugs_smt_update(void) { @@ -1129,6 +1650,16 @@ void cpu_bugs_smt_update(void) break; } + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1233,16 +1764,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) } /* - * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper - * bit in the mask to allow guests to use the mitigation even in the - * case where the host does not enable it. - */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; - } - - /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass @@ -1259,7 +1780,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + update_spec_ctrl(x86_spec_ctrl_base); } } @@ -1387,6 +1908,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); task_update_spec_tif(task); + if (task == current) + indirect_branch_prediction_barrier(); break; default: return -ERANGE; @@ -1476,7 +1999,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + update_spec_ctrl(x86_spec_ctrl_base); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); @@ -1627,74 +2150,92 @@ static const char * const l1tf_vmx_states[] = { static ssize_t l1tf_show_state(char *buf) { if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) - return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && sched_smt_active())) { - return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, - l1tf_vmx_states[l1tf_vmx_mitigation]); + return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); } - return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, - l1tf_vmx_states[l1tf_vmx_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t itlb_multihit_show_state(char *buf) { if (itlb_multihit_kvm_mitigation) - return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n"); else - return sprintf(buf, "KVM: Vulnerable\n"); + return sysfs_emit(buf, "KVM: Vulnerable\n"); } #else static ssize_t l1tf_show_state(char *buf) { - return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); } static ssize_t itlb_multihit_show_state(char *buf) { - return sprintf(buf, "Processor vulnerable\n"); + return sysfs_emit(buf, "Processor vulnerable\n"); } #endif static ssize_t mds_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - return sprintf(buf, "%s; SMT Host state unknown\n", - mds_strings[mds_mitigation]); + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mds_strings[mds_mitigation]); } if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { - return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], - (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : - sched_smt_active() ? "mitigated" : "disabled")); + return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : + sched_smt_active() ? "mitigated" : "disabled")); } - return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t tsx_async_abort_show_state(char *buf) { if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || (taa_mitigation == TAA_MITIGATION_OFF)) - return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static ssize_t mmio_stale_data_show_state(char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return sysfs_emit(buf, "Unknown: No mitigations\n"); + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - return sprintf(buf, "%s; SMT Host state unknown\n", - taa_strings[taa_mitigation]); + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mmio_strings[mmio_mitigation]); } - return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], - sched_smt_active() ? "vulnerable" : "disabled"); + return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); } static char *stibp_state(void) { - if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS)) return ""; switch (spectre_v2_user_stibp) { @@ -1724,56 +2265,80 @@ static char *ibpb_state(void) return ""; } +static char *pbrsb_eibrs_state(void) +{ + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; + } else { + return ", PBRSB-eIBRS: Not affected"; + } +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sprintf(buf, "Vulnerable: LFENCE\n"); + return sysfs_emit(buf, "Vulnerable: LFENCE\n"); if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) - return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); if (sched_smt_active() && unprivileged_ebpf_enabled() && spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) - return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sprintf(buf, "%s%s%s%s%s%s\n", - spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + pbrsb_eibrs_state(), + spectre_v2_module_string()); } static ssize_t srbds_show_state(char *buf) { - return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); + return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + +static ssize_t retbleed_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); +} + +static ssize_t gds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); } static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { if (!boot_cpu_has_bug(bug)) - return sprintf(buf, "Not affected\n"); + return sysfs_emit(buf, "Not affected\n"); switch (bug) { case X86_BUG_CPU_MELTDOWN: if (boot_cpu_has(X86_FEATURE_PTI)) - return sprintf(buf, "Mitigation: PTI\n"); + return sysfs_emit(buf, "Mitigation: PTI\n"); if (hypervisor_is_type(X86_HYPER_XEN_PV)) - return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); + return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); break; case X86_BUG_SPECTRE_V1: - return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); + return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: - return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]); case X86_BUG_L1TF: if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) @@ -1792,11 +2357,21 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SRBDS: return srbds_show_state(buf); + case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_MMIO_UNKNOWN: + return mmio_stale_data_show_state(buf); + + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + + case X86_BUG_GDS: + return gds_show_state(buf); + default: break; } - return sprintf(buf, "Vulnerable\n"); + return sysfs_emit(buf, "Vulnerable\n"); } ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) @@ -1843,4 +2418,22 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * { return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); } + +ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); +} + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} + +ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_GDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4c85ca112a2a..ae9d8aa3ae48 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,11 +17,16 @@ #include <linux/init.h> #include <linux/kprobes.h> #include <linux/kgdb.h> +#include <linux/mem_encrypt.h> #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/io.h> #include <linux/syscore_ops.h> #include <asm/stackprotector.h> +#include <linux/utsname.h> + +#include <asm/alternative.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/archrandom.h> @@ -57,6 +62,7 @@ #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> #endif +#include <asm/set_memory.h> #include "cpu.h" @@ -444,8 +450,6 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { - struct pkru_state *pk; - /* check the boot processor, plus compile options for PKU: */ if (!cpu_feature_enabled(X86_FEATURE_PKU)) return; @@ -456,9 +460,6 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) return; cr4_set_bits(X86_CR4_PKE); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (pk) - pk->pkru = init_pkru_value; /* * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we @@ -961,6 +962,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + + if (c->extended_cpuid_level >= 0x80000021) + c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021); + init_scattered_cpuid_features(c); init_speculation_control(c); init_cqm(c); @@ -1025,6 +1032,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SWAPGS BIT(6) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) +#define NO_EIBRS_PBRSB BIT(9) +#define NO_MMIO BIT(10) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1045,6 +1054,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ + VULNWL_INTEL(TIGERLAKE, NO_MMIO), + VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(ALDERLAKE, NO_MMIO), + VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -1063,9 +1077,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1075,42 +1089,88 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_MMIO), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_MMIO), {} }; +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, steppings, \ X86_FEATURE_ANY, issues) +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + +#define VULNBL_HYGON(family, blacklist) \ + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) + #define SRBDS BIT(0) +/* CPU is affected by X86_BUG_MMIO_STALE_DATA */ +#define MMIO BIT(1) +/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ +#define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) +/* CPU is affected by SMT (cross-thread) return predictions */ +#define SMT_RSB BIT(4) +/* CPU is affected by SRSO */ +#define SRSO BIT(5) +/* CPU is affected by GDS */ +#define GDS BIT(6) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED), + VULNBL_HYGON(0x18, RETBLEED), {} }; @@ -1131,6 +1191,13 @@ u64 x86_read_arch_cap_msr(void) return ia32_cap; } +static bool arch_cap_mmio_immune(u64 ia32_cap) +{ + return (ia32_cap & ARCH_CAP_FBSDP_NO && + ia32_cap & ARCH_CAP_PSDP_NO && + ia32_cap & ARCH_CAP_SBDR_SSDP_NO); +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); @@ -1153,8 +1220,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); - if (ia32_cap & ARCH_CAP_IBRS_ALL) + /* + * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature + * flag and protect from vendor-specific bugs via the whitelist. + */ + if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { @@ -1184,12 +1259,48 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) /* * SRBDS affects CPUs which support RDRAND or RDSEED and are listed * in the vulnerability blacklist. + * + * Some of the implications and mitigation of Shared Buffers Data + * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as + * SRBDS. */ if ((cpu_has(c, X86_FEATURE_RDRAND) || cpu_has(c, X86_FEATURE_RDSEED)) && - cpu_matches(cpu_vuln_blacklist, SRBDS)) + cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS)) setup_force_cpu_bug(X86_BUG_SRBDS); + /* + * Processor MMIO Stale Data bug enumeration + * + * Affected CPU list is generally enough to enumerate the vulnerability, + * but for virtualization case check for ARCH_CAP MSR bits also, VMM may + * not want the guest to enumerate the bug. + * + * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, + * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. + */ + if (!arch_cap_mmio_immune(ia32_cap)) { + if (cpu_matches(cpu_vuln_blacklist, MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } + + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + } + + /* + * Check if CPU is vulnerable to GDS. If running in a virtual machine on + * an affected processor, the VMM may have disabled the use of GATHER by + * disabling AVX2. The only way to do this in HW is to clear XCR0[2], + * which means that AVX will be disabled. + */ + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + boot_cpu_has(X86_FEATURE_AVX)) + setup_force_cpu_bug(X86_BUG_GDS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1275,8 +1386,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); - fpu__init_system(c); - #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says @@ -1668,6 +1777,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); + if (boot_cpu_has_bug(X86_BUG_GDS)) + update_gds_msr(); } static __init int setup_noclflush(char *arg) @@ -1966,8 +2077,6 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu__init_cpu(); - if (is_uv_system()) uv_cpu_init(); @@ -2025,8 +2134,6 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu__init_cpu(); - load_fixmap_gdt(cpu); } #endif @@ -2042,6 +2149,8 @@ void microcode_check(void) perf_check_microcode(); + amd_check_microcode(); + /* Reload CPUID max function as it might've changed. */ info.cpuid_level = cpuid_eax(0); @@ -2071,3 +2180,69 @@ void arch_smt_update(void) /* Check whether IPI broadcasting can be enabled */ apic_smt_update(); } + +void __init arch_cpu_finalize_init(void) +{ + identify_boot_cpu(); + + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology(); + + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + + cpu_select_mitigations(); + + arch_smt_update(); + + if (IS_ENABLED(CONFIG_X86_32)) { + /* + * Check whether this is a real i386 which is not longer + * supported and fixup the utsname. + */ + if (boot_cpu_data.x86 < 4) + panic("Kernel requires i486+ for 'invlpg' and other features"); + + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + } + + /* + * Must be before alternatives because it might set or clear + * feature bits. + */ + fpu__init_system(); + fpu__init_cpu(); + + alternative_instructions(); + + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); + } else { + fpu__init_check_bugs(); + } + + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. It + * must be called after late_time_init() so that Hyper-V x86/x64 + * hypercalls work when the SWIOTLB bounce buffers are decrypted. + */ + mem_encrypt_init(); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4d04c127c4a7..8a64520b5310 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -76,9 +76,11 @@ extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); +void cpu_select_mitigations(void); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); +extern void update_gds_msr(void); extern u64 x86_read_arch_cap_msr(void); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 3cbe24ca80ab..f50d4422c7b5 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -44,7 +44,10 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_VAES, X86_FEATURE_AVX }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, @@ -56,9 +59,6 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index b232bd0be78d..43586b7586c0 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -88,8 +88,12 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - /* Socket ID is ApicId[6] for these processors. */ - c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + /* + * Socket ID is ApicId[6] for the processors with model <= 0x3 + * when running on host. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) + c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; cacheinfo_hygon_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { @@ -335,8 +339,8 @@ static void init_hygon(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 11d5c5950e2d..0418606ec3c0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -97,7 +97,7 @@ static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) { ring3mwait_disabled = true; - return 0; + return 1; } __setup("ring3mwait=disable", ring3mwait_disable); @@ -187,6 +187,90 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme_early(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. */ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * KeyID bits effectively lower the number of physical address + * bits. Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -339,6 +423,13 @@ static void early_init_intel(struct cpuinfo_x86 *c) */ if (detect_extended_topology_early(c) < 0) detect_ht_early(c); + + /* + * Adjust the number of physical bits early because it affects the + * valid bits of the MTRR mask registers. + */ + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme_early(c); } #ifdef CONFIG_X86_32 @@ -540,90 +631,6 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) } } -#define MSR_IA32_TME_ACTIVATE 0x982 - -/* Helpers to access TME_ACTIVATE MSR */ -#define TME_ACTIVATE_LOCKED(x) (x & 0x1) -#define TME_ACTIVATE_ENABLED(x) (x & 0x2) - -#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ -#define TME_ACTIVATE_POLICY_AES_XTS_128 0 - -#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ - -#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ -#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 - -/* Values for mktme_status (SW only construct) */ -#define MKTME_ENABLED 0 -#define MKTME_DISABLED 1 -#define MKTME_UNINITIALIZED 2 -static int mktme_status = MKTME_UNINITIALIZED; - -static void detect_tme(struct cpuinfo_x86 *c) -{ - u64 tme_activate, tme_policy, tme_crypto_algs; - int keyid_bits = 0, nr_keyids = 0; - static u64 tme_activate_cpu0 = 0; - - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); - - if (mktme_status != MKTME_UNINITIALIZED) { - if (tme_activate != tme_activate_cpu0) { - /* Broken BIOS? */ - pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); - pr_err_once("x86/tme: MKTME is not usable\n"); - mktme_status = MKTME_DISABLED; - - /* Proceed. We may need to exclude bits from x86_phys_bits. */ - } - } else { - tme_activate_cpu0 = tme_activate; - } - - if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { - pr_info_once("x86/tme: not enabled by BIOS\n"); - mktme_status = MKTME_DISABLED; - return; - } - - if (mktme_status != MKTME_UNINITIALIZED) - goto detect_keyid_bits; - - pr_info("x86/tme: enabled by BIOS\n"); - - tme_policy = TME_ACTIVATE_POLICY(tme_activate); - if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) - pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); - - tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); - if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { - pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", - tme_crypto_algs); - mktme_status = MKTME_DISABLED; - } -detect_keyid_bits: - keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); - nr_keyids = (1UL << keyid_bits) - 1; - if (nr_keyids) { - pr_info_once("x86/mktme: enabled by BIOS\n"); - pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); - } else { - pr_info_once("x86/mktme: disabled by BIOS\n"); - } - - if (mktme_status == MKTME_UNINITIALIZED) { - /* MKTME is usable */ - mktme_status = MKTME_ENABLED; - } - - /* - * KeyID bits effectively lower the number of physical address - * bits. Update cpuinfo_x86::x86_phys_bits accordingly. - */ - c->x86_phys_bits -= keyid_bits; -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -758,9 +765,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); - if (cpu_has(c, X86_FEATURE_TME)) - detect_tme(c); - init_intel_misc_features(c); if (tsx_ctrl_state == TSX_CTRL_ENABLE) diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 2f163e6646b6..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -16,12 +16,17 @@ * respective wildcard entries. * * A typical table entry would be to match a specific CPU - * { X86_VENDOR_INTEL, 6, 0x12 } - * or to match a specific CPU feature - * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, + * X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, - * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) + * + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts + * for various common selections. The above can be shortened to: + * + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); * * Arrays used to match for this should also be declared using * MODULE_DEVICE_TABLE(x86cpu, ...) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8a2b8e791314..f7e0c9549bb9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,7 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/set_memory.h> +#include <linux/kexec.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -315,6 +316,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) struct llist_node *pending; struct mce_evt_llist *l; int apei_err = 0; + struct page *p; /* * Allow instrumentation around external facilities usage. Not that it @@ -370,6 +372,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mca_cfg.panic_timeout; + + /* + * Kdump skips the poisoned page in order to avoid + * touching the error bits again. Poison the page even + * if the error is fatal and the machine is about to + * panic. + */ + if (kexec_crash_loaded()) { + if (final && (final->status & MCI_STATUS_ADDRV)) { + p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + if (p) + SetPageHWPoison(p); + } + } panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -397,13 +413,16 @@ static int msr_to_offset(u32 msr) return -1; } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { - pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } show_stack_regs(regs); @@ -411,7 +430,14 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, while (true) cpu_relax(); +} +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + ex_handler_msr_mce(regs, false); return true; } @@ -447,17 +473,7 @@ __visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, unsigned long error_code, unsigned long fault_addr) { - pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, - regs->ip, (void *)regs->ip); - - show_stack_regs(regs); - - panic("MCA architectural violation!\n"); - - while (true) - cpu_relax(); - + ex_handler_msr_mce(regs, true); return true; } @@ -2212,12 +2228,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return -EINVAL; b = &per_cpu(mce_banks_array, s->id)[bank]; - if (!b->init) return -ENODEV; b->ctl = new; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + mutex_unlock(&mce_sysfs_mutex); return size; } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a0e52bd00ecc..2852c99196fa 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -55,7 +55,9 @@ struct cont_desc { }; static u32 ucode_new_rev; -static u8 amd_ucode_patch[PATCH_MAX_SIZE]; + +/* One blob per node. */ +static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; /* * Microcode patch container file is prepended to the initrd in cpio @@ -429,7 +431,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; - patch = &amd_ucode_patch; + patch = &amd_ucode_patch[0]; #endif desc.cpuid_1_eax = cpuid_1_eax; @@ -441,7 +443,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p return ret; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev >= mc->hdr.patch_id) + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) return ret; if (!__apply_microcode_amd(mc)) { @@ -523,8 +531,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* Check whether we have saved a new patch already: */ - if (*new_rev && rev < mc->hdr.patch_id) { + /* + * Check whether a new patch has been saved already. Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. + */ + if (*new_rev && rev <= mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; return; @@ -538,8 +550,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); } -static enum ucode_state -load_microcode_amd(bool save, u8 family, const u8 *data, size_t size); +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { @@ -557,19 +568,19 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) if (!desc.mc) return -EINVAL; - ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); + ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); if (ret > UCODE_UPDATED) return -EINVAL; return 0; } -void reload_ucode_amd(void) +void reload_ucode_amd(unsigned int cpu) { - struct microcode_amd *mc; u32 rev, dummy; + struct microcode_amd *mc; - mc = (struct microcode_amd *)amd_ucode_patch; + mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); @@ -689,7 +700,7 @@ static enum ucode_state apply_microcode_amd(int cpu) rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ - if (rev >= mc_amd->hdr.patch_id) { + if (rev > mc_amd->hdr.patch_id) { ret = UCODE_OK; goto out; } @@ -783,6 +794,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, kfree(patch); return -EINVAL; } + patch->size = *patch_size; mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); proc_id = mc_hdr->processor_rev_id; @@ -834,9 +846,10 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -static enum ucode_state -load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) { + struct cpuinfo_x86 *c; + unsigned int nid, cpu; struct ucode_patch *p; enum ucode_state ret; @@ -849,22 +862,22 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) return ret; } - p = find_patch(0); - if (!p) { - return ret; - } else { - if (boot_cpu_data.microcode >= p->patch_id) - return ret; + for_each_node(nid) { + cpu = cpumask_first(cpumask_of_node(nid)); + c = &cpu_data(cpu); - ret = UCODE_NEW; - } + p = find_patch(cpu); + if (!p) + continue; - /* save BSP's matching patch for early load */ - if (!save) - return ret; + if (c->microcode >= p->patch_id) + continue; + + ret = UCODE_NEW; - memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE); + memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); + } return ret; } @@ -890,12 +903,11 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); - bool bsp = c->cpu_index == boot_cpu_data.cpu_index; enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; /* reload ucode container only on the boot cpu */ - if (!refresh_fw || !bsp) + if (!refresh_fw) return UCODE_OK; if (c->x86 >= 0x15) @@ -910,7 +922,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, if (!verify_container(fw->data, fw->size, false)) goto fw_release; - ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size); + ret = load_microcode_amd(c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index c95a27513a30..834c5f723dae 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -322,7 +322,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) #endif } -void reload_early_microcode(void) +void reload_early_microcode(unsigned int cpu) { int vendor, family; @@ -336,7 +336,7 @@ void reload_early_microcode(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - reload_ucode_amd(); + reload_ucode_amd(cpu); break; default: break; @@ -782,7 +782,7 @@ void microcode_bsp_resume(void) if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); else if (!uci->mc) - reload_early_microcode(); + reload_early_microcode(cpu); } static struct syscore_ops mc_syscore_ops = { diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index b224d4dae2ff..896f456b3f5c 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -659,7 +659,6 @@ void load_ucode_intel_ap(void) else iup = &intel_ucode_patch; -reget: if (!*iup) { patch = __load_ucode_intel(&uci); if (!patch) @@ -670,12 +669,7 @@ reget: uci.mc = *iup; - if (apply_microcode_early(&uci, true)) { - /* Mixed-silicon system? Try to refetch the proper patch: */ - *iup = NULL; - - goto reget; - } + apply_microcode_early(&uci, true); } static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 1c2f9baf8483..51d95c4b692c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -82,7 +82,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR); ack_APIC_irq(); exiting_irq(); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 507039c20128..4482819bb13c 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -794,8 +794,6 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; - rcu_cpu_starting(smp_processor_id()); - /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d7623e1b927d..f186470c2e66 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -416,6 +416,7 @@ static int pseudo_lock_fn(void *_rdtgrp) struct pseudo_lock_region *plr = rdtgrp->plr; u32 rmid_p, closid_p; unsigned long i; + u64 saved_msr; #ifdef CONFIG_KASAN /* * The registers used for local register variables are also used @@ -459,6 +460,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ + saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); @@ -510,7 +512,7 @@ static int pseudo_lock_fn(void *_rdtgrp) __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -867,6 +869,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) static int measure_cycles_lat_fn(void *_plr) { struct pseudo_lock_region *plr = _plr; + u32 saved_low, saved_high; unsigned long i; u64 start, end; void *mem_r; @@ -875,6 +878,7 @@ static int measure_cycles_lat_fn(void *_plr) /* * Disable hardware prefetchers. */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); mem_r = READ_ONCE(plr->kmem); /* @@ -891,7 +895,7 @@ static int measure_cycles_lat_fn(void *_plr) end = rdtsc_ordered(); trace_pseudo_lock_mem_latency((u32)(end - start)); } - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); plr->thread_done = 1; wake_up_interruptible(&plr->lock_thread_wq); @@ -936,6 +940,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; struct perf_event *miss_event, *hit_event; int hit_pmcnum, miss_pmcnum; + u32 saved_low, saved_high; unsigned int line_size; unsigned int size; unsigned long i; @@ -969,6 +974,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, /* * Disable hardware prefetchers. */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); /* Initialize rest of local variables */ @@ -1027,7 +1033,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, */ rmb(); /* Re-enable hardware prefetchers */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); out_hit: perf_event_release_kernel(hit_event); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 28f786289fce..91016bb18d4f 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -311,7 +311,7 @@ static void update_cpu_closid_rmid(void *info) * executing task might have its own closid selected. Just reuse * the context switch code. */ - resctrl_sched_in(); + resctrl_sched_in(current); } /* @@ -532,7 +532,7 @@ static void _update_task_closid_rmid(void *task) * Otherwise, the MSR is updated when the task is scheduled in. */ if (task == current) - resctrl_sched_in(); + resctrl_sched_in(task); } static void update_task_closid_rmid(struct task_struct *t) @@ -563,11 +563,11 @@ static int __rdtgroup_move_task(struct task_struct *tsk, */ if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else if (rdtgrp->type == RDTMON_GROUP) { if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else { rdt_last_cmd_puts("Can't move task to different control group\n"); return -EINVAL; @@ -577,8 +577,10 @@ static int __rdtgroup_move_task(struct task_struct *tsk, /* * Ensure the task's closid and rmid are written before determining if * the task is current that will decide if it will be interrupted. + * This pairs with the full barrier between the rq->curr update and + * resctrl_sched_in() during context switch. */ - barrier(); + smp_mb(); /* * By now, the task's closid and rmid are set. If the task is current @@ -591,6 +593,18 @@ static int __rdtgroup_move_task(struct task_struct *tsk, return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /** * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group * @r: Resource group @@ -606,8 +620,7 @@ int rdtgroup_tasks_assigned(struct rdtgroup *r) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { ret = 1; break; } @@ -702,12 +715,15 @@ unlock: static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) { struct task_struct *p, *t; + pid_t pid; rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) - seq_printf(s, "%d\n", t->pid); + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + pid = task_pid_vnr(t); + if (pid) + seq_printf(s, "%d\n", pid); + } } rcu_read_unlock(); } @@ -2146,18 +2162,6 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); -} - /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -2175,22 +2179,26 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - t->closid = to->closid; - t->rmid = to->mon.rmid; + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); + + /* + * Order the closid/rmid stores above before the loads + * in task_curr(). This pairs with the full barrier + * between the rq->curr update and resctrl_sched_in() + * during context switch. + */ + smp_mb(); -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. */ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 53004dbd55c4..37f716eaf0e6 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -26,6 +26,7 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, @@ -39,9 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, - { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 24da5ee4f022..5729ed7bb3e7 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -79,7 +79,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) * initial apic id, which also represents 32-bit extended x2apic id. */ c->initial_apicid = edx; - smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); #endif return 0; } @@ -107,7 +107,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c) */ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); c->initial_apicid = edx; - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index 032509adf9de..88a553ee7704 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -55,24 +55,6 @@ void tsx_enable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -static bool __init tsx_ctrl_is_supported(void) -{ - u64 ia32_cap = x86_read_arch_cap_msr(); - - /* - * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this - * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. - * - * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a - * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES - * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get - * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, - * tsx= cmdline requests will do nothing on CPUs without - * MSR_IA32_TSX_CTRL support. - */ - return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); -} - static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) { if (boot_cpu_has_bug(X86_BUG_TAA)) @@ -86,9 +68,22 @@ void __init tsx_init(void) char arg[5] = {}; int ret; - if (!tsx_ctrl_is_supported()) + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR)) return; + setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL); + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); if (ret >= 0) { if (!strcmp(arg, "on")) { diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 0c319d09378d..539d8cf5904d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -37,7 +37,6 @@ #include <linux/kdebug.h> #include <asm/cpu.h> #include <asm/reboot.h> -#include <asm/virtext.h> #include <asm/intel_pt.h> #include <asm/crash.h> #include <asm/cmdline.h> @@ -94,15 +93,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Disable VMX or SVM if needed. - * - * We need to disable virtualization on all CPUs. - * Having VMX or SVM enabled on any CPU may break rebooting - * after the kdump kernel has finished its task. - */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); - /* * Disable Intel PT to stop its logging */ @@ -161,12 +151,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Booting kdump kernel with VMX or SVM enabled won't work, - * because (among other limitations) we can't disable paging - * with the virt flags. - */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); + cpu_emergency_disable_virtualization(); /* * Disable Intel PT to stop its logging diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index e07424e19274..9b2bbb66d0c8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -171,7 +171,6 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); - stack = stack ? : get_stack_pointer(task, regs); regs = unwind_get_entry_regs(&state, &partial); /* @@ -190,9 +189,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (stack = stack ?: get_stack_pointer(task, regs); + stack; + stack = stack_info.next_sp) { const char *stack_name; + stack = PTR_ALIGN(stack, sizeof(long)); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { /* * We weren't on a valid stack. It's possible that @@ -326,7 +329,7 @@ unsigned long oops_begin(void) } NOKPROBE_SYMBOL(oops_begin); -void __noreturn rewind_stack_do_exit(int signr); +void __noreturn rewind_stack_and_make_dead(int signr); void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { @@ -361,7 +364,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) * reuse the task stack and that existing poisons are invalid. */ kasan_unpoison_task_stack(current); - rewind_stack_do_exit(signr); + rewind_stack_and_make_dead(signr); } NOKPROBE_SYMBOL(oops_end); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index b271da0fa219..5e710ad6a3c0 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -50,7 +50,7 @@ void fpu__init_cpu(void) fpu__init_cpu_xstate(); } -static bool fpu__probe_without_cpuid(void) +static bool __init fpu__probe_without_cpuid(void) { unsigned long cr0; u16 fsw, fcw; @@ -68,7 +68,7 @@ static bool fpu__probe_without_cpuid(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } -static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +static void __init fpu__init_system_early_generic(void) { if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { @@ -139,9 +139,6 @@ static void __init fpu__init_system_generic(void) unsigned int fpu_kernel_xstate_size; EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); -/* Get alignment of the TYPE. */ -#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) - /* * Enforce that 'MEMBER' is the last field of 'TYPE'. * @@ -149,8 +146,8 @@ EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); * because that's how C aligns structs. */ #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ - BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \ - TYPE_ALIGN(TYPE))) + BUILD_BUG_ON(sizeof(TYPE) != \ + ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE))) /* * We append the 'struct fpu' to the task_struct: @@ -293,10 +290,10 @@ static void __init fpu__init_parse_early_param(void) * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ -void __init fpu__init_system(struct cpuinfo_x86 *c) +void __init fpu__init_system(void) { fpu__init_parse_early_param(); - fpu__init_system_early_generic(c); + fpu__init_system_early_generic(); /* * The FPU has to be operational for some of the diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 046782df37a6..d8162f6baa5d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -805,6 +805,14 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); + + /* + * CPU capabilities initialization runs before FPU init. So + * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely + * functional, set the feature bit so depending code works. + */ + setup_force_cpu_cap(X86_FEATURE_OSXSAVE); + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 206a4b6144c2..950286016f63 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -383,6 +383,8 @@ static void __init clear_bss(void) { memset(__bss_start, 0, (unsigned long) __bss_stop - (unsigned long) __bss_start); + memset(__brk_base, 0, + (unsigned long) __brk_limit - (unsigned long) __brk_base); } static unsigned long get_cmd_line_ptr(void) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index fe522691ac71..82753622f489 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,6 +32,7 @@ */ static void init_8259A(int auto_eoi); +static bool pcat_compat __ro_after_init; static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); @@ -114,6 +115,7 @@ static void make_8259A_irq(unsigned int irq) disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); + irq_set_status_flags(irq, IRQ_LEVEL); enable_irq(irq); lapic_assign_legacy_vector(irq, true); } @@ -300,15 +302,32 @@ static void unmask_8259A(void) static int probe_8259A(void) { + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); unsigned long flags; - unsigned char probe_val = ~(1 << PIC_CASCADE_IR); - unsigned char new_val; + + /* + * If MADT has the PCAT_COMPAT flag set, then do not bother probing + * for the PIC. Some BIOSes leave the PIC uninitialized and probing + * fails. + * + * Right now this causes problems as quite some code depends on + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly + * when the system has an IO/APIC because then PIC is not required + * at all, except for really old machines where the timer interrupt + * must be routed through the PIC. So just pretend that the PIC is + * there and let legacy_pic->init() initialize it for nothing. + * + * Alternatively this could just try to initialize the PIC and + * repeat the probe, but for cases where there is no PIC that's + * just pointless. + */ + if (pcat_compat) + return nr_legacy_irqs(); + /* - * Check to see if we have a PIC. - * Mask all except the cascade and read - * back the value we just wrote. If we don't - * have a PIC, we will read 0xff as opposed to the - * value we wrote. + * Check to see if we have a PIC. Mask all except the cascade and + * read back the value we just wrote. If we don't have a PIC, we + * will read 0xff as opposed to the value we wrote. */ raw_spin_lock_irqsave(&i8259A_lock, flags); @@ -430,5 +449,9 @@ static int __init i8259A_init_ops(void) return 0; } - device_initcall(i8259A_init_ops); + +void __init legacy_pic_pcat_compat(void) +{ + pcat_compat = true; +} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 16919a9671fa..faed27c8dc39 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -72,8 +72,10 @@ void __init init_ISA_irqs(void) legacy_pic->init(0); - for (i = 0; i < nr_legacy_irqs(); i++) + for (i = 0; i < nr_legacy_irqs(); i++) { irq_set_chip_and_handler(i, chip, handle_level_irq); + irq_set_status_flags(i, IRQ_LEVEL); + } } void __init init_IRQ(void) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index c205d77d57da..3700dc94847c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -358,6 +358,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) kernel_insn_init(insn, dest, MAX_INSN_SIZE); insn_get_length(insn); + /* We can not probe force emulate prefixed instruction */ + if (insn_has_emulate_prefix(insn)) + return 0; + /* Another subsystem puts a breakpoint, failed to recover */ if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index b348dd506d58..90d41b22c5c2 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -43,8 +43,8 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) /* This function only handles jump-optimized kprobe */ if (kp && kprobe_optimized(kp)) { op = container_of(kp, struct optimized_kprobe, kp); - /* If op->list is not empty, op is under optimizing */ - if (list_empty(&op->list)) + /* If op is optimized or under unoptimizing */ + if (list_empty(&op->list) || optprobe_queued_unopt(op)) goto found; } } @@ -314,7 +314,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op) for (i = 1; i < op->optinsn.size; i++) { p = get_kprobe(op->kp.addr + i); - if (p && !kprobe_disabled(p)) + if (p && !kprobe_disarmed(p)) return -EEXIST; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 408b51aba293..f582dda8dd34 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -59,6 +59,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; +static int has_guest_poll = 0; /* * No need for any "IO delay" on KVM */ @@ -584,14 +585,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static int kvm_suspend(void) { + u64 val = 0; + kvm_guest_cpu_offline(false); +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + rdmsrl(MSR_KVM_POLL_CONTROL, val); + has_guest_poll = !(val & 1); +#endif return 0; } static void kvm_resume(void) { kvm_cpu_online(raw_smp_processor_id()); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +#endif } static struct syscore_ops kvm_syscore_ops = { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d81e34e614e0..35dca8a5ca90 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -24,8 +24,8 @@ static int kvmclock __initdata = 1; static int kvmclock_vsyscall __initdata = 1; -static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static int msr_kvm_system_time __ro_after_init; +static int msr_kvm_wall_clock __ro_after_init; static u64 kvm_sched_clock_offset __ro_after_init; static int __init parse_no_kvmclock(char *arg) @@ -189,7 +189,8 @@ static void kvm_setup_secondary_clock(void) void kvmclock_disable(void) { - native_write_msr(msr_kvm_system_time, 0, 0); + if (msr_kvm_system_time) + native_write_msr(msr_kvm_system_time, 0, 0); } static void __init kvmclock_init_mem(void) @@ -286,7 +287,10 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + } else { return; } diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 6b07faaa1579..23154d24b117 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -27,6 +27,11 @@ static __init int register_e820_pmem(void) * simply here to trigger the module to load on demand. */ pdev = platform_device_alloc("e820_pmem", -1); - return platform_device_add(pdev); + + rc = platform_device_add(pdev); + if (rc) + platform_device_put(pdev); + + return rc; } device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 571e38c9ee1d..b8de27bb6e09 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -449,7 +449,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } if (updmsr) - wrmsrl(MSR_IA32_SPEC_CTRL, msr); + update_spec_ctrl_cond(msr); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) @@ -659,6 +659,10 @@ static void amd_e400_idle(void) */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) + return 0; + if (c->x86_vendor != X86_VENDOR_INTEL) return 0; @@ -769,9 +773,8 @@ static int __init idle_setup(char *str) } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, - * it means that mwait will be disabled for CPU C2/C3 - * states. In such case it won't touch the variable - * of boot_option_idle_override. + * it means that mwait will be disabled for CPU C1/C2/C3 + * states. */ boot_option_idle_override = IDLE_NOMWAIT; } else diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 352f876950ab..8ef71fa91ff8 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -293,7 +293,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_fpu_finish(next_p); /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(); + resctrl_sched_in(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 633788362906..b77f0d9dad55 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -610,7 +610,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) } /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(); + resctrl_sched_in(next_p); return prev_p; } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 1daf8f2aa21f..c205f30147cc 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -95,7 +95,7 @@ static void ich_force_hpet_resume(void) static void ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(rcba); + u32 rcba; int err = 0; if (hpet_address || force_hpet_address) @@ -185,7 +185,7 @@ static void hpet_print_force_info(void) static void old_ich_force_hpet_resume(void) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (!force_hpet_address || !cached_dev) return; @@ -207,7 +207,7 @@ static void old_ich_force_hpet_resume(void) static void old_ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (hpet_address || force_hpet_address) return; @@ -298,7 +298,7 @@ static void vt8237_force_hpet_resume(void) static void vt8237_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; @@ -429,7 +429,7 @@ static void nvidia_force_hpet_resume(void) static void nvidia_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index fdef27a84d71..6fede2f00104 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -528,33 +528,29 @@ static inline void kb_wait(void) } } -static void vmxoff_nmi(int cpu, struct pt_regs *regs) -{ - cpu_emergency_vmxoff(); -} +static inline void nmi_shootdown_cpus_on_restart(void); -/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ -static void emergency_vmx_disable_all(void) +static void emergency_reboot_disable_virtualization(void) { /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* - * Disable VMX on all CPUs before rebooting, otherwise we risk hanging - * the machine, because the CPU blocks INIT when it's in VMX root. + * Disable virtualization on all CPUs before rebooting to avoid hanging + * the system, as VMX and SVM block INIT when running in the host. * * We can't take any locks and we may be on an inconsistent state, so - * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. + * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. * - * Do the NMI shootdown even if VMX if off on _this_ CPU, as that - * doesn't prevent a different CPU from being in VMX root operation. + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as + * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx()) { - /* Safely force _this_ CPU out of VMX root operation. */ - __cpu_emergency_vmxoff(); + if (cpu_has_vmx() || cpu_has_svm(NULL)) { + /* Safely force _this_ CPU out of VMX/SVM operation. */ + cpu_emergency_disable_virtualization(); - /* Halt and exit VMX root operation on the other CPUs. */ - nmi_shootdown_cpus(vmxoff_nmi); + /* Disable VMX/SVM and halt on other CPUs. */ + nmi_shootdown_cpus_on_restart(); } } @@ -591,7 +587,7 @@ static void native_machine_emergency_restart(void) unsigned short mode; if (reboot_emergency) - emergency_vmx_disable_all(); + emergency_reboot_disable_virtualization(); tboot_shutdown(TB_SHUTDOWN_REBOOT); @@ -796,6 +792,17 @@ void machine_crash_shutdown(struct pt_regs *regs) /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); +} + #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; @@ -818,7 +825,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; local_irq_disable(); - shootdown_callback(cpu, regs); + if (shootdown_callback) + shootdown_callback(cpu, regs); + + /* + * Prepare the CPU for reboot _after_ invoking the callback so that the + * callback can safely use virtualization instructions, e.g. VMCLEAR. + */ + cpu_emergency_disable_virtualization(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -829,18 +843,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; } -/* - * Halt all other CPUs, calling the specified function on each of them +/** + * nmi_shootdown_cpus - Stop other CPUs via NMI + * @callback: Optional callback to be invoked from the NMI handler + * + * The NMI handler on the remote CPUs invokes @callback, if not + * NULL, first and then disables virtualization to ensure that + * INIT is recognized during reboot. * - * This function can be used to halt all other CPUs on crash - * or emergency reboot time. The function passed as parameter - * will be called inside a NMI handler on all CPUs. + * nmi_shootdown_cpus() can only be invoked once. After the first + * invocation all other CPUs are stuck in crash_nmi_callback() and + * cannot respond to a second NMI. */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; + local_irq_disable(); + /* + * Avoid certain doom if a shootdown already occurred; re-registering + * the NMI handler will cause list corruption, modifying the callback + * will do who knows what, etc... + */ + if (WARN_ON_ONCE(crash_ipi_issued)) + return; + /* Make a note of crashing cpu. Will be used in NMI callback. */ crashing_cpu = safe_smp_processor_id(); @@ -868,7 +896,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) msecs--; } - /* Leave the nmi callback set */ + /* + * Leave the nmi callback set, shootdown is a one-time thing. Clearing + * the callback could result in a NULL pointer dereference if a CPU + * (finally) responds after the timeout expires. + */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) +{ + if (!crash_ipi_issued) + nmi_shootdown_cpus(NULL); } /* @@ -898,6 +936,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* No other CPUs to shoot down */ } +static inline void nmi_shootdown_cpus_on_restart(void) { } + void run_crash_ipi_callback(struct pt_regs *regs) { } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index b8d4e9c3c070..31ee37f87b2b 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -31,7 +31,7 @@ #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> -#include <asm/virtext.h> +#include <asm/reboot.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -121,7 +121,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -134,7 +134,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { ipi_entering_ack_irq(); - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); irq_exit(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8367bd7a9a81..d6a8efff9c61 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -99,6 +99,17 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +struct mwait_cpu_dead { + unsigned int control; + unsigned int status; +}; + +/* + * Cache line aligned data for mwait_play_dead(). Separate on purpose so + * that it's unlikely to be touched by other CPUs. + */ +static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); + /* Logical package management. We might want to allocate that dynamically */ unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); @@ -224,6 +235,8 @@ static void notrace start_secondary(void *unused) #endif load_current_idt(); cpu_init(); + fpu__init_cpu(); + rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); @@ -1674,10 +1687,10 @@ static bool wakeup_cpu0(void) */ static inline void mwait_play_dead(void) { + struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); unsigned int eax, ebx, ecx, edx; unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; - void *mwait_ptr; int i; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || @@ -1712,13 +1725,6 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } - /* - * This should be a memory location in a cache line which is - * unlikely to be touched by other processors. The actual - * content is immaterial as it is not actually modified in any way. - */ - mwait_ptr = ¤t_thread_info()->flags; - wbinvd(); while (1) { @@ -1730,9 +1736,9 @@ static inline void mwait_play_dead(void) * case where we return around the loop. */ mb(); - clflush(mwait_ptr); + clflush(md); mb(); - __monitor(mwait_ptr, 0, 0); + __monitor(md, 0, 0); mb(); __mwait(eax, 0); /* diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 60d2c3798ba2..2f97d1a1032f 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -175,8 +175,7 @@ void set_task_blockstep(struct task_struct *task, bool on) * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race - * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but - * PTRACE_KILL is not safe. + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index f7476ce23b6e..42e31358a9d3 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -70,9 +70,6 @@ static int __init control_va_addr_alignment(char *str) if (*str == 0) return 1; - if (*str == '=') - str++; - if (!strcmp(str, "32")) va_align.flags = ALIGN_VA_32; else if (!strcmp(str, "64")) @@ -82,11 +79,11 @@ static int __init control_va_addr_alignment(char *str) else if (!strcmp(str, "on")) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; else - return 0; + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); return 1; } -__setup("align_va_addr", control_va_addr_alignment); +__setup("align_va_addr=", control_va_addr_alignment); SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 653b7f617b61..fff04d285976 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -264,6 +264,22 @@ static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = { "Lenovo ideapad D330-10IGM"), }, }, + { + /* Lenovo IdeaPad Duet 3 10IGL5 with 1200x1920 portrait screen */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, + "IdeaPad Duet 3 10IGL5"), + }, + }, + { + /* Lenovo Yoga Book X91F / X91L */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + /* Non exact match to match F + L versions */ + DMI_MATCH(DMI_PRODUCT_NAME, "Lenovo YB1-X91"), + }, + }, {}, }; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index b934f9f68a16..c85634152d30 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -90,22 +90,27 @@ static struct orc_entry *orc_find(unsigned long ip); static struct orc_entry *orc_ftrace_find(unsigned long ip) { struct ftrace_ops *ops; - unsigned long caller; + unsigned long tramp_addr, offset; ops = ftrace_ops_trampoline(ip); if (!ops) return NULL; + /* Set tramp_addr to the start of the code copied by the trampoline */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) - caller = (unsigned long)ftrace_regs_call; + tramp_addr = (unsigned long)ftrace_regs_caller; else - caller = (unsigned long)ftrace_call; + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset; /* Prevent unlikely recursion */ - if (ip == caller) + if (ip == tramp_addr) return NULL; - return orc_find(caller); + return orc_find(tramp_addr); } #else static struct orc_entry *orc_ftrace_find(unsigned long ip) @@ -682,7 +687,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Otherwise, skip ahead to the user-specified starting frame: */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->sp < (unsigned long)first_frame)) + state->sp <= (unsigned long)first_frame)) unwind_next_frame(state); return; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index fae5b00cbccf..f51fc7fde3a0 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -722,8 +722,9 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ break; + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); @@ -753,6 +754,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } +setup: auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1afe211d7a7c..09506e492f1d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -21,6 +21,8 @@ #define LOAD_OFFSET __START_KERNEL_map #endif +#define RUNTIME_DISCARD_EXIT + #include <asm-generic/vmlinux.lds.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -381,7 +383,7 @@ SECTIONS .brk : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ + *(.bss..brk) /* areas brk users have reserved */ __brk_limit = .; } |