Diffstat (limited to 'arch/x86')
125 files changed, 2775 insertions, 832 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index be4403a8e1b4..9fd2e3c2494a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -50,6 +50,7 @@ config X86 select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE @@ -2437,6 +2438,25 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y depends on X86_64 || X86_PAE +config GDS_FORCE_MITIGATION + bool "Force GDS Mitigation" + depends on CPU_SUP_INTEL + default n + help + Gather Data Sampling (GDS) is a hardware vulnerability which allows + unprivileged speculative access to data which was previously stored in + vector registers. + + This option is equivalent to setting gather_data_sampling=force on the + command line. The microcode mitigation is used if present, otherwise + AVX is disabled as a mitigation. On affected systems that are missing + the microcode any userspace code that unconditionally uses AVX will + break with this option set. + + Setting this option on systems not vulnerable to GDS has no effect. + + If in doubt, say N. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on X86_64 && HUGETLB_PAGE && MIGRATION diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 6539c50fb9aa..82500962f1b3 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -100,7 +100,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(objtree)/$(obj) $(obj)/header.o: $(obj)/zoffset.h -LDFLAGS_setup.elf := -m elf_i386 -T +LDFLAGS_setup.elf := -m elf_i386 -z noexecstack -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index d401b4a262b0..a860390852ff 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -35,7 +35,7 @@ intcall: movw %dx, %si movw %sp, %di movw $11, %cx - rep; movsd + rep; movsl /* Pop full state from the stack */ popal @@ -70,7 +70,7 @@ intcall: jz 4f movw %sp, %si movw $11, %cx - rep; movsd + rep; movsl 4: addw $44, %sp /* Restore state and return */ diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index ef5a9cc66fb8..a4c5fb92b1cb 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -114,66 +114,78 @@ typedef unsigned int addr_t; static inline u8 rdfs8(addr_t addr) { + u8 *ptr = (u8 *)absolute_pointer(addr); u8 v; - asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr)); return v; } static inline u16 rdfs16(addr_t addr) { + u16 *ptr = (u16 *)absolute_pointer(addr); u16 v; - asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline u32 rdfs32(addr_t addr) { + u32 *ptr = (u32 *)absolute_pointer(addr); u32 v; - asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline void wrfs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); + u8 *ptr = (u8 *)absolute_pointer(addr); + asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v)); } static inline void wrfs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); + u16 *ptr = (u16 *)absolute_pointer(addr); + asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v)); } static inline void wrfs32(u32 v, addr_t addr) { - 
asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); + u32 *ptr = (u32 *)absolute_pointer(addr); + asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v)); } static inline u8 rdgs8(addr_t addr) { + u8 *ptr = (u8 *)absolute_pointer(addr); u8 v; - asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr)); return v; } static inline u16 rdgs16(addr_t addr) { + u16 *ptr = (u16 *)absolute_pointer(addr); u16 v; - asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline u32 rdgs32(addr_t addr) { + u32 *ptr = (u32 *)absolute_pointer(addr); u32 v; - asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline void wrgs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); + u8 *ptr = (u8 *)absolute_pointer(addr); + asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v)); } static inline void wrgs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); + u16 *ptr = (u16 *)absolute_pointer(addr); + asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v)); } static inline void wrgs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); + u32 *ptr = (u32 *)absolute_pointer(addr); + asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v)); } /* Note: these only return true/false, not a signed return value! */ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5642f025b397..23b6e2da72bf 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -57,6 +57,10 @@ else KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ && echo "-z noreloc-overflow -pie --no-dynamic-linker") endif + +KBUILD_LDFLAGS += -z noexecstack +KBUILD_LDFLAGS += $(call ld-option,--no-warn-rwx-segments) + LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 9bcea386db65..98b4ff1c51f3 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -34,7 +34,7 @@ static void copy_boot_params(void) u16 cl_offset; }; const struct old_cmdline * const oldcmd = - (const struct old_cmdline *)OLD_CL_ADDRESS; + absolute_pointer(OLD_CL_ADDRESS); BUILD_BUG_ON(sizeof boot_params != 4096); memcpy(&boot_params.hdr, &hdr, sizeof hdr); diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 993dd06c8923..806729a7172f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,8 @@ #include <asm/percpu.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/nospec-branch.h> /* @@ -146,27 +148,19 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro POP_REGS pop_rdi=1 skip_r11rcx=0 +.macro POP_REGS pop_rdi=1 popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx - .if \skip_r11rcx - popq %rsi - .else popq %r11 - .endif popq %r10 popq %r9 popq %r8 popq %rax - .if \skip_r11rcx - popq %rsi - .else popq %rcx - .endif popq %rdx popq %rsi .if \pop_rdi @@ -317,6 +311,62 @@ For 32-bit we have the following conventions - kernel is built with #endif /* + * IBRS kernel mitigation for Spectre_v2. + * + * Assumes full context is established (PUSH_REGS, CR3 and GS) and it clobbers + * the regs it uses (AX, CX, DX). 
Must be called before the first RET + * instruction (NOTE! UNTRAIN_RET includes a RET instruction) + * + * The optional argument is used to save/restore the current value, + * which is used on the paranoid paths. + * + * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. + */ +.macro IBRS_ENTER save_reg + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS + movl $MSR_IA32_SPEC_CTRL, %ecx + +.ifnb \save_reg + rdmsr + shl $32, %rdx + or %rdx, %rax + mov %rax, \save_reg + test $SPEC_CTRL_IBRS, %eax + jz .Ldo_wrmsr_\@ + lfence + jmp .Lend_\@ +.Ldo_wrmsr_\@: +.endif + + movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx + movl %edx, %eax + shr $32, %rdx + wrmsr +.Lend_\@: +.endm + +/* + * Similar to IBRS_ENTER, requires KERNEL GS,CR3 and clobbers (AX, CX, DX) + * regs. Must be called after the last RET. + */ +.macro IBRS_EXIT save_reg + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS + movl $MSR_IA32_SPEC_CTRL, %ecx + +.ifnb \save_reg + mov \save_reg, %rdx +.else + movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx + andl $(~SPEC_CTRL_IBRS), %edx +.endif + + movl %edx, %eax + shr $32, %rdx + wrmsr +.Lend_\@: +.endm + +/* * Mitigate Spectre v1 for conditional swapgs code paths. * * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 37d9016d4768..e6c258bf9511 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -643,7 +643,6 @@ ENTRY(__switch_to_asm) movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset #endif -#ifdef CONFIG_RETPOLINE /* * When switching from a shallower to a deeper call stack * the RSB may either underflow or use entries populated @@ -652,7 +651,6 @@ ENTRY(__switch_to_asm) * speculative execution to prevent attack. */ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW -#endif /* restore callee-saved registers */ popfl @@ -1502,13 +1500,13 @@ ENTRY(async_page_fault) END(async_page_fault) #endif -ENTRY(rewind_stack_do_exit) +ENTRY(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movl PER_CPU_VAR(cpu_current_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp - call do_exit + call make_task_dead 1: jmp 1b -END(rewind_stack_do_exit) +END(rewind_stack_and_make_dead) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ced052ddc4d7..bf08de4232e3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -235,6 +235,10 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) /* IRQs are off. */ movq %rax, %rdi movq %rsp, %rsi + + /* clobbers %rax, make sure it is after saving the syscall nr */ + IBRS_ENTER + call do_syscall_64 /* returns with IRQs disabled */ TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -311,8 +315,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - POP_REGS pop_rdi=0 skip_r11rcx=1 + IBRS_EXIT + POP_REGS pop_rdi=0 /* * Now all regs are restored except RSP and RDI. @@ -363,7 +367,6 @@ ENTRY(__switch_to_asm) movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset #endif -#ifdef CONFIG_RETPOLINE /* * When switching from a shallower to a deeper call stack * the RSB may either underflow or use entries populated @@ -372,7 +375,6 @@ ENTRY(__switch_to_asm) * speculative execution to prevent attack. 
*/ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW -#endif /* restore callee-saved registers */ popfq @@ -685,6 +687,7 @@ GLOBAL(retint_user) TRACE_IRQS_IRETQ GLOBAL(swapgs_restore_regs_and_return_to_usermode) + IBRS_EXIT #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ testb $3, CS(%rsp) @@ -1250,7 +1253,13 @@ ENTRY(paranoid_entry) */ FENCE_SWAPGS_KERNEL_ENTRY - ret + /* + * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like + * CR3 above, keep the old value in a callee saved register. + */ + IBRS_ENTER save_reg=%r15 + + RET END(paranoid_entry) /* @@ -1278,12 +1287,20 @@ ENTRY(paranoid_exit) jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG + + /* + * Must restore IBRS state before both CR3 and %GS since we need access + * to the per-CPU x86_spec_ctrl_shadow variable. + */ + IBRS_EXIT save_reg=%r15 + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 .Lparanoid_exit_restore: jmp restore_regs_and_return_to_kernel END(paranoid_exit) + /* * Save all registers in pt_regs, and switch GS if needed. */ @@ -1303,6 +1320,7 @@ ENTRY(error_entry) FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + IBRS_ENTER .Lerror_entry_from_usermode_after_swapgs: /* Put us onto the real thread stack. */ @@ -1367,6 +1385,7 @@ ENTRY(error_entry) SWAPGS FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + IBRS_ENTER /* * Pretend that the exception came from user mode: set up pt_regs @@ -1472,6 +1491,8 @@ ENTRY(nmi) PUSH_AND_CLEAR_REGS rdx=(%rdx) ENCODE_FRAME_POINTER + IBRS_ENTER + /* * At this point we no longer need to worry about stack damage * due to nesting -- we're on the normal thread stack and we're @@ -1695,6 +1716,9 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi + /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ + IBRS_EXIT save_reg=%r15 + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 @@ -1735,10 +1759,10 @@ END(nmi) ENTRY(ignore_sysret) UNWIND_HINT_EMPTY mov $-ENOSYS, %eax - sysret + sysretl END(ignore_sysret) -ENTRY(rewind_stack_do_exit) +ENTRY(rewind_stack_and_make_dead) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp @@ -1747,5 +1771,5 @@ ENTRY(rewind_stack_do_exit) leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS - call do_exit -END(rewind_stack_do_exit) + call make_task_dead +END(rewind_stack_and_make_dead) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 40d2834a8101..85dd05de648c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -4,7 +4,6 @@ * * Copyright 2000-2002 Andi Kleen, SuSE Labs. */ -#include "calling.h" #include <asm/asm-offsets.h> #include <asm/current.h> #include <asm/errno.h> @@ -17,6 +16,8 @@ #include <linux/linkage.h> #include <linux/err.h> +#include "calling.h" + .section .entry.text, "ax" /* @@ -106,6 +107,8 @@ ENTRY(entry_SYSENTER_compat) xorl %r15d, %r15d /* nospec r15 */ cld + IBRS_ENTER + /* * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. 
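Annotation: the IBRS_ENTER/IBRS_EXIT sites added throughout the 64-bit entry code above are the kernel-IBRS Spectre v2 mitigation: SPEC_CTRL.IBRS is written on every kernel entry and the user value is restored on every exit, with the paranoid and NMI paths stashing the live MSR value in a callee-saved register. Restated as C pseudocode, assuming (as the calling.h comment states) that the per-CPU x86_spec_ctrl_current shadow already has SPEC_CTRL_IBRS set; this is only a sketch, the authoritative implementation is the assembly above:

static __always_inline void ibrs_enter_sketch(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
		return;
	/* Kernel value: per-CPU shadow, IBRS bit already set. */
	native_wrmsrl(MSR_IA32_SPEC_CTRL,
		      this_cpu_read(x86_spec_ctrl_current));
}

static __always_inline void ibrs_exit_sketch(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
		return;
	/* User value: same shadow with the IBRS bit stripped. */
	native_wrmsrl(MSR_IA32_SPEC_CTRL,
		      this_cpu_read(x86_spec_ctrl_current) & ~SPEC_CTRL_IBRS);
}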
To save a few cycles, we can check whether @@ -253,6 +256,8 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) */ TRACE_IRQS_OFF + IBRS_ENTER + movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ @@ -262,6 +267,9 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) /* Opportunistic SYSRET */ sysret32_from_system_call: TRACE_IRQS_ON /* User mode traces as IRQs on. */ + + IBRS_EXIT + movq RBX(%rsp), %rbx /* pt_regs->rbx */ movq RBP(%rsp), %rbp /* pt_regs->rbp */ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ @@ -403,6 +411,7 @@ ENTRY(entry_INT80_compat) * gate turned them off. */ TRACE_IRQS_OFF + IBRS_ENTER movq %rsp, %rdi call do_int80_syscall_32 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 5bfe2243a08f..ec5d8d0bd222 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -172,7 +172,7 @@ quiet_cmd_vdso = VDSO $@ VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \ $(call ld-option, --build-id) $(call ld-option, --eh-frame-hdr) \ - -Bsymbolic + -Bsymbolic -z noexecstack GCOV_PROFILE := n # diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 46caca4d9141..a3cd828359f8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -228,8 +228,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Round the lowest possible end address up to a PMD boundary. */ end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE_MAX) - end = TASK_SIZE_MAX; + if (end >= DEFAULT_MAP_WINDOW) + end = DEFAULT_MAP_WINDOW; end -= len; if (end > start) { @@ -329,7 +329,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) static __init int vdso_setup(char *s) { vdso64_enabled = simple_strtoul(s, NULL, 0); - return 0; + return 1; } __setup("vdso=", vdso_setup); #endif diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2d9e1372b070..d157d0adef06 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -324,6 +324,16 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. Setting _EARLY flag + * makes sure we unwind call-stack before perf sample rip is set to + * IbsOpRip. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; + return 0; } @@ -693,6 +703,14 @@ fail: data.raw = &raw; } + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. 
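Annotation: two of the smaller fixes in this series, vdso_setup() above and setup_apicpmtimer() further down, change a __setup() handler to return 1. That is the convention for "parameter consumed": returning 0 makes the early-param code treat the option as unknown and pass it along to init's argument or environment strings. A sketch of the convention (example_setup and example_param are hypothetical names):

static int example_param;

static __init int example_setup(char *s)
{
	example_param = simple_strtoul(s, NULL, 0);
	return 1;	/* consumed; don't report "example=" as unknown */
}
__setup("example=", example_setup);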
+ */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + data.callchain = perf_callchain(event, iregs); + throttle = perf_event_overflow(event, &data, ®s); out: if (throttle) { diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 849f0ba53a9b..49b3ea1c1ea1 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -460,7 +460,7 @@ static u64 pt_config_filters(struct perf_event *event) pt->filters.filter[range].msr_b = filter->msr_b; } - rtit_ctl |= filter->config << pt_address_ranges[range].reg_off; + rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off; } return rtit_ctl; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 2bf1170f7afd..34da6d27d839 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2699,6 +2699,7 @@ static bool hswep_has_limit_sbox(unsigned int device) return false; pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4); + pci_dev_put(dev); if (!hswep_get_chop(capid4)) return true; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 3ebd77770f98..4cfda3cfae7f 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm) set_personality_ia32(false); setup_new_exec(bprm); + install_exec_creds(bprm); regs->cs = __USER32_CS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = @@ -314,8 +315,6 @@ static int load_aout_binary(struct linux_binprm *bprm) if (retval < 0) return retval; - install_exec_creds(bprm); - if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h index 1b010a859b8b..6de59a4f723c 100644 --- a/arch/x86/include/asm/acenv.h +++ b/arch/x86/include/asm/acenv.h @@ -16,7 +16,19 @@ /* Asm macros */ -#define ACPI_FLUSH_CPU_CACHE() wbinvd() +/* + * ACPI_FLUSH_CPU_CACHE() flushes caches on entering sleep states. + * It is required to prevent data loss. + * + * While running inside virtual machine, the kernel can bypass cache flushing. + * Changing sleep state in a virtual machine doesn't affect the host system + * sleep state and cannot lead to data loss. + */ +#define ACPI_FLUSH_CPU_CACHE() \ +do { \ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) \ + wbinvd(); \ +} while (0) int __acpi_acquire_global_lock(unsigned int *lock); int __acpi_release_global_lock(unsigned int *lock); diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 3ac991d81e74..4d3cac3c9b25 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -86,10 +86,6 @@ static inline bool rdseed_int(unsigned int *v) return ok; } -/* Conditional execution based on CPU type */ -#define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) -#define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) - /* * These are the generic interfaces; they must not be declared if the * stubs in <linux/random.h> are to be invoked, @@ -99,22 +95,22 @@ static inline bool rdseed_int(unsigned int *v) static inline bool arch_get_random_long(unsigned long *v) { - return arch_has_random() ? rdrand_long(v) : false; + return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_long(v) : false; } static inline bool arch_get_random_int(unsigned int *v) { - return arch_has_random() ? rdrand_int(v) : false; + return static_cpu_has(X86_FEATURE_RDRAND) ? 
rdrand_int(v) : false; } static inline bool arch_get_random_seed_long(unsigned long *v) { - return arch_has_random_seed() ? rdseed_long(v) : false; + return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_long(v) : false; } static inline bool arch_get_random_seed_int(unsigned int *v) { - return arch_has_random_seed() ? rdseed_int(v) : false; + return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false; } extern void x86_init_rdrand(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h index 542509b53e0f..dd8ff1ca2aef 100644 --- a/arch/x86/include/asm/bugs.h +++ b/arch/x86/include/asm/bugs.h @@ -4,8 +4,6 @@ #include <asm/processor.h> -extern void check_bugs(void); - #if defined(CONFIG_CPU_SUP_INTEL) void check_mpx_erratum(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index fb97cf7c4137..1def972b6ca3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -46,15 +46,13 @@ typedef u64 __attribute__((aligned(4))) compat_u64; typedef u32 compat_uptr_t; struct compat_stat { - compat_dev_t st_dev; - u16 __pad1; + u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; - compat_dev_t st_rdev; - u16 __pad2; + u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index 884466592943..cdf39decf734 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -1,16 +1,26 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _CPU_DEVICE_ID -#define _CPU_DEVICE_ID 1 +#ifndef _ASM_X86_CPU_DEVICE_ID +#define _ASM_X86_CPU_DEVICE_ID /* * Declare drivers belonging to specific x86 CPUs * Similar in spirit to pci_device_id and related PCI functions + * + * The wildcard initializers are in mod_devicetable.h because + * file2alias needs them. Sigh. */ - #include <linux/mod_devicetable.h> +/* Get the INTEL_FAM* model defines */ +#include <asm/intel-family.h> +/* And the X86_VENDOR_* ones */ +#include <asm/processor.h> -#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) +/* Centaur FAM6 models */ +#define X86_CENTAUR_FAM6_C7_A 0xa +#define X86_CENTAUR_FAM6_C7_D 0xd +#define X86_CENTAUR_FAM6_NANO 0xf +#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY @@ -23,8 +33,11 @@ * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. * - * Backport version to keep the SRBDS pile consistant. No shorter variants - * required for this. + * Use only if you need all selectors. Otherwise use one of the shorter + * macros of the X86_MATCH_* family. If there is no matching shorthand + * macro, consider to add one. If you really need to wrap one of the macros + * into another macro at the usage site for good reasons, then please + * start this local macro with X86_MATCH to allow easy grepping. */ #define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ _steppings, _feature, _data) { \ @@ -36,6 +49,147 @@ .driver_data = (unsigned long) _data \ } +/** + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching + * @_vendor: The vendor name, e.g. 
INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@_vendor + * @_family: The family number or X86_FAMILY_ANY + * @_model: The model number, model constant or X86_MODEL_ANY + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY + * @_data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ + X86_STEPPING_ANY, feature, data) + +/** + * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@vendor + * @family: The family number or X86_FAMILY_ANY + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, \ + X86_MODEL_ANY, feature, data) + +/** + * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@vendor + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \ + X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data) + +/** + * X86_MATCH_FEATURE - Macro for matching a CPU feature + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set to wildcards. + */ +#define X86_MATCH_FEATURE(feature, data) \ + X86_MATCH_VENDOR_FEATURE(ANY, feature, data) + +/* Transitional to keep the existing code working */ +#define X86_FEATURE_MATCH(feature) X86_MATCH_FEATURE(feature, NULL) + +/** + * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@vendor + * @family: The family number or X86_FAMILY_ANY + * @model: The model number, model constant or X86_MODEL_ANY + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, \ + X86_FEATURE_ANY, data) + +/** + * X86_MATCH_VENDOR_FAM - Match vendor and family + * @vendor: The vendor name, e.g. 
INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@vendor + * @family: The family number or X86_FAMILY_ANY + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are + * set of wildcards. + */ +#define X86_MATCH_VENDOR_FAM(vendor, family, data) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) + +/** + * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model + * @model: The model name without the INTEL_FAM6_ prefix or ANY + * The model name is expanded to INTEL_FAM6_@model internally + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * The vendor is set to INTEL, the family to 6 and all other missing + * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards. + * + * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information. + */ +#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ + X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) + +/* + * Match specific microcode revisions. + * + * vendor/family/model/stepping must be all set. + * + * Only checks against the boot CPU. When mixed-stepping configs are + * valid for a CPU model, add a quirk for every valid stepping and + * do the fine-tuning in the quirk handler. + */ + +struct x86_cpu_desc { + u8 x86_family; + u8 x86_vendor; + u8 x86_model; + u8 x86_stepping; + u32 x86_microcode_rev; +}; + +#define INTEL_CPU_DESC(model, stepping, revision) { \ + .x86_family = 6, \ + .x86_vendor = X86_VENDOR_INTEL, \ + .x86_model = (model), \ + .x86_stepping = (stepping), \ + .x86_microcode_rev = (revision), \ +} + extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); +extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table); -#endif +#endif /* _ASM_X86_CPU_DEVICE_ID */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f7f9604b10cc..5cf8dca571cf 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -202,8 +202,8 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ @@ -283,6 +283,16 @@ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ +/* FREE! (11*32+ 6) */ +/* FREE! (11*32+ 7) */ +/* FREE! (11*32+ 8) */ +/* FREE! (11*32+ 9) */ +/* FREE! 
(11*32+10) */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_MSR_TSX_CTRL (11*32+18) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ @@ -295,6 +305,7 @@ #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -394,5 +405,10 @@ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ +#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_GDS X86_BUG(29) /* CPU is affected by Gather Data Sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4d90f4b64437..49decc8339b3 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -42,7 +42,7 @@ extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); extern void fpu__init_cpu(void); extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_system(void); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); extern u64 fpu__get_supported_xfeatures_mask(void); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 00e01d215f74..e1bd23641fc6 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -682,7 +682,7 @@ struct hv_enlightened_vmcs { u64 guest_rip; u32 hv_clean_fields; - u32 hv_padding_32; + u32 padding32_1; u32 hv_synthetic_controls; struct { u32 nested_flush_hypercall:1; @@ -690,7 +690,7 @@ struct hv_enlightened_vmcs { u32 reserved:30; } hv_enlightenments_control; u32 hv_vp_id; - + u32 padding32_2; u64 hv_vm_id; u64 partition_assist_page; u64 padding64_4[4]; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 89789e8c80f6..e16574c16e93 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -67,6 +67,8 @@ struct legacy_pic { void (*make_irq)(unsigned int irq); }; +void legacy_pic_pcat_compat(void); + extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; diff --git 
a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 2a8e5f78e23c..ccf07426a84d 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -13,6 +13,9 @@ * that group keep the CPUID for the variants sorted by model number. */ +/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ +#define INTEL_FAM6_ANY X86_MODEL_ANY + #define INTEL_FAM6_CORE_YONAH 0x0E #define INTEL_FAM6_CORE2_MEROM 0x0F @@ -64,6 +67,19 @@ #define INTEL_FAM6_COMETLAKE 0xA5 #define INTEL_FAM6_COMETLAKE_L 0xA6 +#define INTEL_FAM6_ROCKETLAKE 0xA7 + +/* Hybrid Core/Atom Processors */ + +#define INTEL_FAM6_LAKEFIELD 0x8A +#define INTEL_FAM6_ALDERLAKE 0x97 +#define INTEL_FAM6_ALDERLAKE_L 0x9A +#define INTEL_FAM6_ALDERLAKE_N 0xBE + +#define INTEL_FAM6_RAPTORLAKE 0xB7 +#define INTEL_FAM6_RAPTORLAKE_P 0xBA +#define INTEL_FAM6_RAPTORLAKE_S 0xBF + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ @@ -83,13 +99,19 @@ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ #define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ #define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ + #define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */ +#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ +#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ +/* Family 5 */ +#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ + /* Useful macros */ #define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \ { \ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5125fca472bb..0a41c62369f2 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -21,6 +21,7 @@ #ifndef __ASSEMBLY__ #include <linux/string.h> +#include <linux/module.h> #include <asm/page.h> #include <asm/ptrace.h> @@ -217,6 +218,14 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add +#endif #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 616f8e637bc3..b51ac6eed904 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -80,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline void mem_encrypt_init(void) { } + #define __bss_decrypted #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 2b7cc5397f80..b675db12a8ab 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -5,6 +5,7 @@ #include <asm/cpu.h> #include <linux/earlycpio.h> #include <linux/initrd.h> +#include <asm/microcode_amd.h> struct ucode_patch { struct list_head plist; @@ -130,14 +131,16 @@ static inline unsigned int x86_cpuid_family(void) int __init microcode_init(void); extern void __init load_ucode_bsp(void); 
extern void load_ucode_ap(void); -void reload_early_microcode(void); +void reload_early_microcode(unsigned int cpu); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); extern bool initrd_gone; +void microcode_bsp_resume(void); #else static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } -static inline void reload_early_microcode(void) { } +static inline void reload_early_microcode(unsigned int cpu) { } +static inline void microcode_bsp_resume(void) { } static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 5c524d4f71cd..403a8e76b310 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -47,12 +47,14 @@ struct microcode_amd { extern void __init load_ucode_amd_bsp(unsigned int family); extern void load_ucode_amd_ap(unsigned int family); extern int __init save_microcode_in_initrd_amd(unsigned int family); -void reload_ucode_amd(void); +void reload_ucode_amd(unsigned int cpu); +extern void amd_check_microcode(void); #else static inline void __init load_ucode_amd_bsp(unsigned int family) {} static inline void load_ucode_amd_ap(unsigned int family) {} static inline int __init save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } -void reload_ucode_amd(void) {} +static inline void reload_ucode_amd(unsigned int cpu) {} +static inline void amd_check_microcode(void) {} #endif #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 892af8ab95d7..4be36bfb2477 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -47,6 +47,12 @@ #define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ +#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) + +/* A mask for bits which the kernel toggles when controlling mitigations */ +#define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ + | SPEC_CTRL_RRSBA_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -73,6 +79,7 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */ #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ #define ARCH_CAP_SSB_NO BIT(4) /* * Not susceptible to Speculative Store Bypass @@ -96,6 +103,50 @@ * Not susceptible to * TSX Async Abort (TAA) vulnerabilities. */ +#define ARCH_CAP_SBDR_SSDP_NO BIT(13) /* + * Not susceptible to SBDR and SSDP + * variants of Processor MMIO stale data + * vulnerabilities. + */ +#define ARCH_CAP_FBSDP_NO BIT(14) /* + * Not susceptible to FBSDP variant of + * Processor MMIO stale data + * vulnerabilities. + */ +#define ARCH_CAP_PSDP_NO BIT(15) /* + * Not susceptible to PSDP variant of + * Processor MMIO stale data + * vulnerabilities. 
+ */ +#define ARCH_CAP_FB_CLEAR BIT(17) /* + * VERW clears CPU fill buffer + * even on MDS_NO CPUs. + */ +#define ARCH_CAP_FB_CLEAR_CTRL BIT(18) /* + * MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS] + * bit available to control VERW + * behavior. + */ +#define ARCH_CAP_RRSBA BIT(19) /* + * Indicates RET may use predictors + * other than the RSB. With eIBRS + * enabled predictions in kernel mode + * are restricted to targets in + * kernel. + */ +#define ARCH_CAP_PBRSB_NO BIT(24) /* + * Not susceptible to Post-Barrier + * Return Stack Buffer Predictions. + */ +#define ARCH_CAP_GDS_CTRL BIT(25) /* + * CPU is vulnerable to Gather + * Data Sampling (GDS) and + * has controls for mitigation. + */ +#define ARCH_CAP_GDS_NO BIT(26) /* + * CPU is not vulnerable to Gather + * Data Sampling (GDS). + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -113,6 +164,9 @@ /* SRBDS support */ #define MSR_IA32_MCU_OPT_CTRL 0x00000123 #define RNGDS_MITG_DIS BIT(0) +#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */ +#define GDS_MITG_DIS BIT(4) /* Disable GDS mitigation */ +#define GDS_MITG_LOCKED BIT(5) /* GDS mitigation locked */ #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 @@ -360,6 +414,13 @@ #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_TW_CFG 0xc0011023 + +#define MSR_AMD64_DE_CFG 0xc0011029 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) +#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9 + #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 @@ -386,6 +447,10 @@ #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +/* Zen4 */ +#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -428,9 +493,6 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c -#define MSR_F10H_DECFG 0xc0011029 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 62f9903544b5..8dba996e9f9e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -9,6 +9,7 @@ #include <asm/alternative-asm.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> +#include <asm/percpu.h> /* * Fill the CPU return stack buffer. @@ -35,6 +36,7 @@ * the optimal version — two calls, each with their own speculation * trap should their return address end up getting used, in a loop. */ +#ifdef CONFIG_X86_64 #define __FILL_RETURN_BUFFER(reg, nr, sp) \ mov $(nr/2), reg; \ 771: \ @@ -52,7 +54,29 @@ 774: \ dec reg; \ jnz 771b; \ + add $(BITS_PER_LONG/8) * nr, sp; \ + /* barrier for jnz misprediction */ \ + lfence; +#else +/* + * i386 doesn't unconditionally have LFENCE, as such it can't + * do a loop. 
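Annotation: the new MSR_IA32_ARCH_CAPABILITIES bits above (FB_CLEAR, RRSBA, PBRSB_NO, GDS_CTRL/GDS_NO) are consumed by the CPU bug enumeration code, which reads the MSR when it is enumerated and then forces the matching X86_BUG_* flag unless the corresponding *_NO bit reports immunity. A hedged sketch of that pattern; the real cpu_set_bug_bits() also consults per-model vulnerability tables before declaring a CPU affected:

	u64 ia32_cap = 0;

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);

	/* Simplified: affected unless the CPU explicitly reports GDS_NO. */
	if (!(ia32_cap & ARCH_CAP_GDS_NO))
		setup_force_cpu_bug(X86_BUG_GDS);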
+ */ +#define __FILL_RETURN_BUFFER(reg, nr, sp) \ + .rept nr; \ + call 772f; \ + int3; \ +772:; \ + .endr; \ add $(BITS_PER_LONG/8) * nr, sp; +#endif + +#define ISSUE_UNBALANCED_RET_GUARD(sp) \ + call 992f; \ + int3; \ +992: \ + add $(BITS_PER_LONG/8), sp; \ + lfence; #ifdef __ASSEMBLY__ @@ -119,7 +143,7 @@ ANNOTATE_NOSPEC_ALTERNATIVE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_LFENCE #else jmp *\reg #endif @@ -130,7 +154,7 @@ ANNOTATE_NOSPEC_ALTERNATIVE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_LFENCE #else call *\reg #endif @@ -141,13 +165,11 @@ * monstrosity above, manually. */ .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req -#ifdef CONFIG_RETPOLINE ANNOTATE_NOSPEC_ALTERNATIVE ALTERNATIVE "jmp .Lskip_rsb_\@", \ __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ \ftr .Lskip_rsb_\@: -#endif .endm #else /* __ASSEMBLY__ */ @@ -181,7 +203,7 @@ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #else /* CONFIG_X86_32 */ @@ -211,7 +233,7 @@ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -223,9 +245,12 @@ /* The Spectre V2 mitigation variants */ enum spectre_v2_mitigation { SPECTRE_V2_NONE, - SPECTRE_V2_RETPOLINE_GENERIC, - SPECTRE_V2_RETPOLINE_AMD, - SPECTRE_V2_IBRS_ENHANCED, + SPECTRE_V2_RETPOLINE, + SPECTRE_V2_LFENCE, + SPECTRE_V2_EIBRS, + SPECTRE_V2_EIBRS_RETPOLINE, + SPECTRE_V2_EIBRS_LFENCE, + SPECTRE_V2_IBRS, }; /* The indirect branch speculation control variants */ @@ -254,15 +279,17 @@ extern char __indirect_thunk_end[]; * retpoline and IBRS mitigations for Spectre v2 need this; only on future * CPUs with IBRS_ALL *might* it be avoided. 
*/ -static inline void vmexit_fill_RSB(void) +static __always_inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE unsigned long loops; asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE("jmp 910f", - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), - X86_FEATURE_RETPOLINE) + ALTERNATIVE_2("jmp 910f", "", X86_FEATURE_RSB_VMEXIT, + "jmp 911f", X86_FEATURE_RSB_VMEXIT_LITE) + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)) + "911:" + __stringify(ISSUE_UNBALANCED_RET_GUARD(%1)) "910:" : "=r" (loops), ASM_CALL_CONSTRAINT : : "memory" ); @@ -289,6 +316,9 @@ static inline void indirect_branch_prediction_barrier(void) /* The Intel SPEC CTRL MSR base value cache */ extern u64 x86_spec_ctrl_base; +DECLARE_PER_CPU(u64, x86_spec_ctrl_current); +extern void update_spec_ctrl_cond(u64 val); +extern u64 spec_ctrl_current(void); /* * With retpoline, we must use IBRS to restrict branch prediction @@ -298,18 +328,16 @@ extern u64 x86_spec_ctrl_base; */ #define firmware_restrict_branch_speculation_start() \ do { \ - u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \ - \ preempt_disable(); \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, \ + spec_ctrl_current() | SPEC_CTRL_IBRS, \ X86_FEATURE_USE_IBRS_FW); \ } while (0) #define firmware_restrict_branch_speculation_end() \ do { \ - u64 val = x86_spec_ctrl_base; \ - \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, \ + spec_ctrl_current(), \ X86_FEATURE_USE_IBRS_FW); \ preempt_enable(); \ } while (0) @@ -321,6 +349,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); +DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); + #include <asm/segment.h> /** diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index bbfde3d2662f..4bcd9d0c7bee 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -11,13 +11,6 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - extern int numa_off; /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cc4bb218f1c6..f3049d55c522 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1001,4 +1001,6 @@ enum taa_mitigations { TAA_MITIGATION_TSX_DISABLED, }; +extern bool gds_ucode_mitigated(void); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index a671a1145906..9177b4354c3f 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +void cpu_emergency_disable_virtualization(void); + typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); void run_crash_ipi_callback(struct pt_regs *regs); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ae13bc974416..c5ed79159975 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -91,27 +91,16 @@ extern unsigned long _brk_end; void *extend_brk(size_t size, size_t align); /* - * Reserve space in the brk section. 
The name must be unique within - * the file, and somewhat descriptive. The size is in bytes. Must be - * used at file scope. + * Reserve space in the .brk section, which is a block of memory from which the + * caller is allowed to allocate very early (before even memblock is available) + * by calling extend_brk(). All allocated memory will be eventually converted + * to memblock. Any leftover unallocated memory will be freed. * - * (This uses a temp function to wrap the asm so we can pass it the - * size parameter; otherwise we wouldn't be able to. We can't use a - * "section" attribute on a normal variable because it always ends up - * being @progbits, which ends up allocating space in the vmlinux - * executable.) + * The size is in bytes. */ -#define RESERVE_BRK(name,sz) \ - static void __section(.discard.text) __used notrace \ - __brk_reservation_fn_##name##__(void) { \ - asm volatile ( \ - ".pushsection .brk_reservation,\"aw\",@nobits;" \ - ".brk." #name ":" \ - " 1:.skip %c0;" \ - " .size .brk." #name ", . - 1b;" \ - " .popsection" \ - : : "i" (sz)); \ - } +#define RESERVE_BRK(name, size) \ + __section(.bss..brk) __aligned(1) __used \ + static char __brk_##name[size] /* Helper for reserving space for arrays of things */ #define RESERVE_BRK_ARRAY(type, name, entries) \ @@ -129,12 +118,19 @@ asmlinkage void __init x86_64_start_reservations(char *real_mode_data); #endif /* __i386__ */ #endif /* _SETUP */ -#else -#define RESERVE_BRK(name,sz) \ - .pushsection .brk_reservation,"aw",@nobits; \ -.brk.name: \ -1: .skip sz; \ - .size .brk.name,.-1b; \ + +#else /* __ASSEMBLY */ + +.macro __RESERVE_BRK name, size + .pushsection .bss..brk, "aw" +GLOBAL(__brk_\name) + .skip \size +END(__brk_\name) .popsection +.endm + +#define RESERVE_BRK(name, size) __RESERVE_BRK name, size + #endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_SETUP_H */ diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 8be6afb58471..32662cbaa27e 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -21,7 +21,6 @@ struct saved_context { #endif unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; - bool misc_enable_saved; struct saved_msrs saved_msrs; struct desc_ptr gdt_desc; struct desc_ptr idt; @@ -30,6 +29,7 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + bool misc_enable_saved; } __attribute__((packed)); #endif /* _ASM_X86_SUSPEND_32_H */ diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index a7af9f53c0cb..b2861400c6a2 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -14,9 +14,13 @@ * Image of the saved processor state, used by the low level ACPI suspend to * RAM code and by the low level hibernation code. * - * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that - * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c, - * still work as required. + * If you modify it, check how it is used in arch/x86/kernel/acpi/wakeup_64.S + * and make sure that __save/__restore_processor_state(), defined in + * arch/x86/power/cpu.c, still work as required. + * + * Because the structure is packed, make sure to avoid unaligned members. For + * optimisation purposes but also because tools like kmemleak only search for + * pointers that are aligned. 
*/ struct saved_context { struct pt_regs regs; @@ -36,7 +40,6 @@ struct saved_context { unsigned long cr0, cr2, cr3, cr4, cr8; u64 misc_enable; - bool misc_enable_saved; struct saved_msrs saved_msrs; unsigned long efer; u16 gdt_pad; /* Unused */ @@ -48,6 +51,7 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + bool misc_enable_saved; } __attribute__((packed)); #define loaddebug(thread,register) \ diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index a4a8b1b16c0c..956e4145311b 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h @@ -5,6 +5,15 @@ #include <asm/processor.h> #include <asm/tsc.h> +static inline unsigned long random_get_entropy(void) +{ + if (!IS_ENABLED(CONFIG_X86_TSC) && + !cpu_feature_enabled(X86_FEATURE_TSC)) + return random_get_entropy_fallback(); + return rdtsc(); +} +#define random_get_entropy random_get_entropy + /* Assume we use the PIT time source for the clock tick */ #define CLOCK_TICK_RATE PIT_TICK_RATE diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index eb5bbfeccb66..196cf01f58fd 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -22,13 +22,12 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { -#ifndef CONFIG_X86_TSC - if (!boot_cpu_has(X86_FEATURE_TSC)) + if (!IS_ENABLED(CONFIG_X86_TSC) && + !cpu_feature_enabled(X86_FEATURE_TSC)) return 0; -#endif - return rdtsc(); } +#define get_cycles get_cycles extern struct system_counterval_t convert_art_to_tsc(u64 art); extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 0116b2ee9e64..62810550024d 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -89,12 +89,6 @@ static inline int cpu_has_svm(const char **msg) return 0; } - if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { - if (msg) - *msg = "can't execute cpuid_8000000a"; - return 0; - } - if (!boot_cpu_has(X86_FEATURE_SVM)) { if (msg) *msg = "svm not available"; @@ -114,7 +108,21 @@ static inline void cpu_svm_disable(void) wrmsrl(MSR_VM_HSAVE_PA, 0); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + if (efer & EFER_SVME) { + /* + * Force GIF=1 prior to disabling SVM to ensure INIT and NMI + * aren't blocked, e.g. if a fatal error occurred between CLGI + * and STGI. Note, STGI may #UD if SVM is disabled from NMI + * context between reading EFER and executing STGI. In that + * case, GIF must already be set, otherwise the NMI would have + * been blocked, so just eat the fault. 
+ */ + asm_volatile_goto("1: stgi\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "memory" : fault); +fault: + wrmsrl(MSR_EFER, efer & ~EFER_SVME); + } } /** Makes sure SVM is disabled, if it is supported on the CPU diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b35c34cfbe52..2f0fa294d617 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -156,6 +156,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) madt->address); } + if (madt->flags & ACPI_MADT_PCAT_COMPAT) + legacy_pic_pcat_compat(); + default_acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); @@ -1351,6 +1354,17 @@ static int __init disable_acpi_pci(const struct dmi_system_id *d) return 0; } +static int __init disable_acpi_xsdt(const struct dmi_system_id *d) +{ + if (!acpi_force) { + pr_notice("%s detected: force use of acpi=rsdt\n", d->ident); + acpi_gbl_do_not_use_xsdt = TRUE; + } else { + pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n"); + } + return 0; +} + static int __init dmi_disable_acpi(const struct dmi_system_id *d) { if (!acpi_force) { @@ -1475,6 +1489,19 @@ static const struct dmi_system_id acpi_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), }, }, + /* + * Boxes that need ACPI XSDT use disabled due to corrupted tables + */ + { + .callback = disable_acpi_xsdt, + .ident = "Advantech DAC-BJ01", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NEC"), + DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"), + DMI_MATCH(DMI_BIOS_VERSION, "V1.12"), + DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"), + }, + }, {} }; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 918a23704c0c..33882d5ab25d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -366,6 +366,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("alt table %px, -> %px", start, end); + + /* + * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. + * During the process, KASAN becomes confused seeing partial LA57 + * conversion and triggers a false-positive out-of-bound report. + * + * Disable KASAN until the patching is complete. + */ + kasan_disable_current(); + /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. 
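Annotation: the apply_alternatives() hunk above brackets the patching loop with kasan_disable_current()/kasan_enable_current(). With CONFIG_X86_5LEVEL, KASAN_SHADOW_START itself depends on the X86_FEATURE_LA57 alternative, so shadow checks would fire on the half-patched state; the per-task depth counter silences them for the duration. The same pattern applies anywhere code knowingly performs accesses KASAN would misreport (do_unsanitized_work() below is a hypothetical stand-in for the patch loop):

	kasan_disable_current();	/* bumps current->kasan_depth */
	do_unsanitized_work();		/* accesses KASAN would misreport */
	kasan_enable_current();		/* re-enables reporting */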
@@ -426,6 +437,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insnbuf, insnbuf_sz); } + + kasan_enable_current(); } #ifdef CONFIG_SMP @@ -677,8 +690,8 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, } else { local_irq_save(flags); memcpy(addr, opcode, len); - local_irq_restore(flags); sync_core(); + local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 9791828f3fcd..9318fe7d850e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -166,7 +166,7 @@ static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return 0; + return 1; } __setup("apicpmtimer", setup_apicpmtimer); #endif @@ -403,10 +403,9 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new) if (vector && !eilvt_entry_is_changeable(vector, new)) /* may not change if vectors are different */ return rsvd; - rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); - } while (rsvd != new); + } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new)); - rsvd &= ~APIC_EILVT_MASKED; + rsvd = new & ~APIC_EILVT_MASKED; if (rsvd && rsvd != vector) pr_info("LVT offset %d assigned for vector 0x%02x\n", offset, rsvd); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 677508baf95a..af59aa9c5523 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2449,17 +2449,21 @@ static int io_apic_get_redir_entries(int ioapic) unsigned int arch_dynirq_lower_bound(unsigned int from) { + unsigned int ret; + /* * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use * gsi_top if ioapic_dynirq_base hasn't been initialized yet. */ - if (!ioapic_initialized) - return gsi_top; + ret = ioapic_dynirq_base ? : gsi_top; + /* - * For DT enabled machines ioapic_dynirq_base is irrelevant and not - * updated. So simply return @from if ioapic_dynirq_base == 0. + * For DT enabled machines ioapic_dynirq_base is irrelevant and + * always 0. gsi_top can be 0 if there is no IO/APIC registered. + * 0 is an invalid interrupt number for dynamic allocations. Return + * @from instead. */ - return ioapic_dynirq_base ? : from; + return ret ? : from; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 8e70c2ba21b3..fb17767552ef 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -102,7 +102,10 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) return 1; return apic == &apic_x2apic_phys; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f7151cd03cb0..3d7a8049f637 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -247,12 +247,6 @@ extern int (*console_blank_hook)(int); #endif /* - * The apm_bios device is one of the misc char devices. - * This is its minor number. 
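The reserve_eilvt_offset() change above replaces an open-coded atomic_cmpxchg() retry loop with atomic_try_cmpxchg(). A standalone C11 sketch of why that removes the separate re-read (hypothetical reserve_slot() helper, not the kernel atomic_t API):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int slot;

static unsigned int reserve_slot(unsigned int new)
{
	unsigned int expected = atomic_load(&slot);

	do {
		if (expected)		/* already claimed, keep old value */
			return expected;
		/* On failure, "expected" is refreshed with the current value. */
	} while (!atomic_compare_exchange_strong(&slot, &expected, new));

	return new;
}

int main(void)
{
	printf("first reserve  -> %u\n", reserve_slot(5));
	printf("second reserve -> %u\n", reserve_slot(7));	/* keeps 5 */
	return 0;
}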
- */ -#define APM_MINOR_DEV 134 - -/* * Various options can be changed at boot time as follows: * (We allow underscores for compatibility with the modules code) * apm=on/off enable/disable APM diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 98c23126f751..84667781c41d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -23,11 +23,6 @@ #include "cpu.h" -static const int amd_erratum_383[]; -static const int amd_erratum_400[]; -static const int amd_erratum_1054[]; -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); - /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX @@ -35,6 +30,83 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); */ static u32 nodes_per_socket = 1; +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +static const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + +static const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); + +/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ +static const int amd_erratum_1054[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); + +static const int amd_zenbleed[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), + AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), + AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), + AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); + +static const int amd_erratum_1485[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), + AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) +{ + int osvw_id = *erratum++; + u32 range; + u32 ms; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 4) | cpu->x86_stepping; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 
gprs[8] = { 0 }; @@ -199,6 +271,15 @@ static void init_amd_k6(struct cpuinfo_x86 *c) return; } #endif + /* + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. + */ + clear_cpu_cap(c, X86_FEATURE_XSAVES); } static void init_amd_k7(struct cpuinfo_x86 *c) @@ -789,8 +870,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } -#define MSR_AMD64_DE_CFG 0xC0011029 - static void init_amd_ln(struct cpuinfo_x86 *c) { /* @@ -885,12 +964,62 @@ static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); - /* - * Fix erratum 1076: CPB feature bit not being set in CPUID. - * Always set it, except when running under a hypervisor. - */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) - set_cpu_cap(c, X86_FEATURE_CPB); + /* Fix up CPUID bits, but only if not virtualised. */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + + /* Erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) + set_cpu_cap(c, X86_FEATURE_CPB); + + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } +} + +static bool cpu_has_zenbleed_microcode(void) +{ + u32 good_rev = 0; + + switch (boot_cpu_data.x86_model) { + case 0x30 ... 0x3f: good_rev = 0x0830107a; break; + case 0x60 ... 0x67: good_rev = 0x0860010b; break; + case 0x68 ... 0x6f: good_rev = 0x08608105; break; + case 0x70 ... 0x7f: good_rev = 0x08701032; break; + case 0xa0 ... 0xaf: good_rev = 0x08a00008; break; + + default: + return false; + break; + } + + if (boot_cpu_data.microcode < good_rev) + return false; + + return true; +} + +static void zenbleed_check(struct cpuinfo_x86 *c) +{ + if (!cpu_has_amd_erratum(c, amd_zenbleed)) + return; + + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return; + + if (!cpu_has(c, X86_FEATURE_AVX)) + return; + + if (!cpu_has_zenbleed_microcode()) { + pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n"); + msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } else { + msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } } static void init_amd(struct cpuinfo_x86 *c) @@ -951,16 +1080,16 @@ static void init_amd(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* * Verify that the MSR write was successful (could be running * under a hypervisor) and only then assume that LFENCE is * serializing. 
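The zenbleed and erratum 1485 handling above ultimately relies on the family/model/stepping range matching that cpu_has_amd_erratum() falls back to when OSVW is unusable. A standalone userspace sketch using the same encoding, with ranges copied from the amd_zenbleed table (the example CPUs are only illustrative):

#include <stdbool.h>
#include <stdio.h>

#define MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
	(((f) << 24) | ((m_start) << 16) | ((s_start) << 12) | \
	 ((m_end) << 4) | (s_end))
#define RANGE_FAMILY(r)	(((r) >> 24) & 0xff)
#define RANGE_START(r)	(((r) >> 12) & 0xfff)
#define RANGE_END(r)	((r) & 0xfff)

static const unsigned int zenbleed_ranges[] = {
	MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf),
	MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
	MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf),
	MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf),
	0,
};

static bool in_ranges(unsigned int family, unsigned int model,
		      unsigned int stepping)
{
	unsigned int ms = (model << 4) | stepping;	/* model-stepping key */
	const unsigned int *r;

	for (r = zenbleed_ranges; *r; r++)
		if (family == RANGE_FAMILY(*r) &&
		    ms >= RANGE_START(*r) && ms <= RANGE_END(*r))
			return true;
	return false;
}

int main(void)
{
	/* e.g. family 0x17, model 0x31, stepping 0 falls in the 0x30-0x4f range */
	printf("17h/31h/0: %d\n", in_ranges(0x17, 0x31, 0x0));
	/* family 0x19 is outside every listed range */
	printf("19h/01h/1: %d\n", in_ranges(0x19, 0x01, 0x1));
	return 0;
}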
*/ - ret = rdmsrl_safe(MSR_F10H_DECFG, &val); - if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + ret = rdmsrl_safe(MSR_AMD64_DE_CFG, &val); + if (!ret && (val & MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT)) { /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } else { @@ -995,6 +1124,12 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); + + zenbleed_check(c); + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + cpu_has_amd_erratum(c, amd_erratum_1485)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); } #ifdef CONFIG_X86_32 @@ -1090,73 +1225,6 @@ static const struct cpu_dev amd_cpu_dev = { cpu_dev_register(amd_cpu_dev); -/* - * AMD errata checking - * - * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or - * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that - * have an OSVW id assigned, which it takes as first argument. Both take a - * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). - * - * Example: - * - * const int amd_erratum_319[] = - * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), - * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), - * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); - */ - -#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } -#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } -#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ - ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) -#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) -#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) -#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) - -static const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), - AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); - -static const int amd_erratum_383[] = - AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); - -/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ -static const int amd_erratum_1054[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) -{ - int osvw_id = *erratum++; - u32 range; - u32 ms; - - if (osvw_id >= 0 && osvw_id < 65536 && - cpu_has(cpu, X86_FEATURE_OSVW)) { - u64 osvw_len; - - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); - if (osvw_id < osvw_len) { - u64 osvw_bits; - - rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), - osvw_bits); - return osvw_bits & (1ULL << (osvw_id & 0x3f)); - } - } - - /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_stepping; - while ((range = *erratum++)) - if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && - (ms >= AMD_MODEL_RANGE_START(range)) && - (ms <= AMD_MODEL_RANGE_END(range))) - return true; - - return false; -} - void set_dr_addr_mask(unsigned long mask, int dr) { if (!boot_cpu_has(X86_FEATURE_BPEXT)) @@ -1175,3 +1243,18 @@ void set_dr_addr_mask(unsigned long mask, int dr) break; } } + +static void zenbleed_check_cpu(void *unused) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + zenbleed_check(c); +} + +void amd_check_microcode(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + + on_each_cpu(zenbleed_check_cpu, NULL, 1); +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c524fa1f4c0e..6e1acbdd27a5 100644 
--- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -9,7 +9,6 @@ * - Andrew D. Balsa (code cleanup). */ #include <linux/init.h> -#include <linux/utsname.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/nospec.h> @@ -25,34 +24,69 @@ #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> -#include <asm/alternative.h> #include <asm/pgtable.h> -#include <asm/set_memory.h> #include <asm/intel-family.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> +#include <linux/bpf.h> #include "cpu.h" static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init retbleed_select_mitigation(void); +static void __init spectre_v2_user_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); -static void __init mds_print_mitigation(void); +static void __init md_clear_update_mitigation(void); +static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); +static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init gds_select_mitigation(void); -/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); + static DEFINE_MUTEX(spec_ctrl_mutex); +/* Update SPEC_CTRL MSR and its cached copy unconditionally */ +static void update_spec_ctrl(u64 val) +{ + this_cpu_write(x86_spec_ctrl_current, val); + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + /* - * The vendor and possibly platform specific bits which can be modified in - * x86_spec_ctrl_base. + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; +void update_spec_ctrl_cond(u64 val) +{ + if (this_cpu_read(x86_spec_ctrl_current) == val) + return; + + this_cpu_write(x86_spec_ctrl_current, val); + + /* + * When KERNEL_IBRS this MSR is written on return-to-user, unless + * forced the update can be delayed until that time. + */ + if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +u64 spec_ctrl_current(void) +{ + return this_cpu_read(x86_spec_ctrl_current); +} +EXPORT_SYMBOL_GPL(spec_ctrl_current); /* * AMD specific MSR info for Speculative Store Bypass control. @@ -75,107 +109,61 @@ EXPORT_SYMBOL_GPL(mds_user_clear); DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); -void __init check_bugs(void) -{ - identify_boot_cpu(); - - /* - * identify_boot_cpu() initialized SMT support information, let the - * core code know. - */ - cpu_smt_check_topology(); - - if (!IS_ENABLED(CONFIG_SMP)) { - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); - } +/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ +DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); +EXPORT_SYMBOL_GPL(mmio_stale_data_clear); +void __init cpu_select_mitigations(void) +{ /* * Read the SPEC_CTRL MSR to account for reserved bits which may * have unknown values. 
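The new x86_spec_ctrl_current cache introduced below lets update_spec_ctrl_cond() skip redundant WRMSRs. A minimal single-variable sketch of the pattern (stubbed MSR write; the real code keeps a per-CPU copy and can also defer the write when KERNEL_IBRS rewrites the MSR on return to user):

#include <stdint.h>
#include <stdio.h>

static uint64_t spec_ctrl_current;	/* stand-in for the per-CPU copy */

static void wrmsr_spec_ctrl(uint64_t val)
{
	printf("WRMSR IA32_SPEC_CTRL = %#llx\n", (unsigned long long)val);
}

/* Unconditional: update cache and hardware. */
static void update_spec_ctrl(uint64_t val)
{
	spec_ctrl_current = val;
	wrmsr_spec_ctrl(val);
}

/* Conditional: skip the MSR write when the value is unchanged. */
static void update_spec_ctrl_cond(uint64_t val)
{
	if (spec_ctrl_current == val)
		return;
	spec_ctrl_current = val;
	wrmsr_spec_ctrl(val);
}

int main(void)
{
	update_spec_ctrl(0x1);		/* e.g. IBRS */
	update_spec_ctrl_cond(0x1);	/* unchanged: no WRMSR */
	update_spec_ctrl_cond(0x3);	/* e.g. IBRS | STIBP: writes */
	return 0;
}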
AMD64_LS_CFG MSR is cached in the early AMD * init code as it is not enumerated and depends on the family. */ - if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - /* Allow STIBP in MSR_SPEC_CTRL if supported */ - if (boot_cpu_has(X86_FEATURE_STIBP)) - x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. + */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - ssb_select_mitigation(); - l1tf_select_mitigation(); - mds_select_mitigation(); - taa_select_mitigation(); - srbds_select_mitigation(); - - /* - * As MDS and TAA mitigations are inter-related, print MDS - * mitigation until after TAA mitigation selection is done. - */ - mds_print_mitigation(); - - arch_smt_update(); - -#ifdef CONFIG_X86_32 /* - * Check whether we are able to run this kernel safely on SMP. - * - * - i386 is no longer supported. - * - In order to run on anything without a TSC, we need to be - * compiled for a i486. + * retbleed_select_mitigation() relies on the state set by + * spectre_v2_select_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. */ - if (boot_cpu_data.x86 < 4) - panic("Kernel requires i486+ for 'invlpg' and other features"); - - init_utsname()->machine[1] = - '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); - alternative_instructions(); - - fpu__init_check_bugs(); -#else /* CONFIG_X86_64 */ - alternative_instructions(); - + retbleed_select_mitigation(); /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. + * spectre_v2_user_select_mitigation() relies on the state set by + * retbleed_select_mitigation(); specifically the STIBP selection is + * forced for UNRET. */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -#endif + spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + md_clear_select_mitigation(); + srbds_select_mitigation(); + gds_select_mitigation(); } +/* + * NOTE: For VMX, this function is not called in the vmexit path. + * It uses vmx_spec_ctrl_restore_host() instead. + */ void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) { - u64 msrval, guestval, hostval = x86_spec_ctrl_base; + u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); struct thread_info *ti = current_thread_info(); - /* Is MSR_SPEC_CTRL implemented ? */ if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - /* - * Restrict guest_spec_ctrl to supported values. Clear the - * modifiable bits in the host base value and or the - * modifiable bits from the guest value. - */ - guestval = hostval & ~x86_spec_ctrl_mask; - guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; - - /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) - hostval |= ssbd_tif_to_spec_ctrl(ti->flags); - - /* Conditional STIBP enabled? 
*/ - if (static_branch_unlikely(&switch_to_cond_stibp)) - hostval |= stibp_tif_to_spec_ctrl(ti->flags); - if (hostval != guestval) { msrval = setguest ? guestval : hostval; wrmsrl(MSR_IA32_SPEC_CTRL, msrval); @@ -256,14 +244,6 @@ static void __init mds_select_mitigation(void) } } -static void __init mds_print_mitigation(void) -{ - if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) - return; - - pr_info("%s\n", mds_strings[mds_mitigation]); -} - static int __init mds_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MDS)) @@ -311,7 +291,7 @@ static void __init taa_select_mitigation(void) /* TSX previously disabled by tsx=off */ if (!boot_cpu_has(X86_FEATURE_RTM)) { taa_mitigation = TAA_MITIGATION_TSX_DISABLED; - goto out; + return; } if (cpu_mitigations_off()) { @@ -325,7 +305,7 @@ static void __init taa_select_mitigation(void) */ if (taa_mitigation == TAA_MITIGATION_OFF && mds_mitigation == MDS_MITIGATION_OFF) - goto out; + return; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) taa_mitigation = TAA_MITIGATION_VERW; @@ -357,18 +337,6 @@ static void __init taa_select_mitigation(void) if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); - - /* - * Update MDS mitigation, if necessary, as the mds_user_clear is - * now enabled for TAA mitigation. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } -out: - pr_info("%s\n", taa_strings[taa_mitigation]); } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -393,6 +361,154 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "MMIO Stale Data: " fmt + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static bool mmio_nosmt __ro_after_init = false; + +static const char * const mmio_strings[] = { + [MMIO_MITIGATION_OFF] = "Vulnerable", + [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", +}; + +static void __init mmio_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || + boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || + cpu_mitigations_off()) { + mmio_mitigation = MMIO_MITIGATION_OFF; + return; + } + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable CPU buffer clear mitigation for host and VMM, if also affected + * by MDS or TAA. Otherwise, enable mitigation for VMM only. + */ + if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && + boot_cpu_has(X86_FEATURE_RTM))) + static_branch_enable(&mds_user_clear); + else + static_branch_enable(&mmio_stale_data_clear); + + /* + * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can + * be propagated to uncore buffers, clearing the Fill buffers on idle + * is required irrespective of SMT state. + */ + if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + static_branch_enable(&mds_idle_clear); + + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. 
+ */ + if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(ia32_cap & ARCH_CAP_MDS_NO))) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + + if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init mmio_stale_data_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + mmio_mitigation = MMIO_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_nosmt = true; + } + + return 0; +} +early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static void __init md_clear_update_mitigation(void) +{ + if (cpu_mitigations_off()) + return; + + if (!static_key_enabled(&mds_user_clear)) + goto out; + + /* + * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data + * mitigation, if necessary. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } + if (taa_mitigation == TAA_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_select_mitigation(); + } + if (mmio_mitigation == MMIO_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_select_mitigation(); + } +out: + if (boot_cpu_has_bug(X86_BUG_MDS)) + pr_info("MDS: %s\n", mds_strings[mds_mitigation]); + if (boot_cpu_has_bug(X86_BUG_TAA)) + pr_info("TAA: %s\n", taa_strings[taa_mitigation]); + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); + else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + pr_info("MMIO Stale Data: Unknown: No mitigations\n"); +} + +static void __init md_clear_select_mitigation(void) +{ + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + + /* + * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update + * and print their mitigation after MDS, TAA and MMIO Stale Data + * mitigation selection is done. + */ + md_clear_update_mitigation(); +} + +#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { @@ -453,11 +569,13 @@ static void __init srbds_select_mitigation(void) return; /* - * Check to see if this is one of the MDS_NO systems supporting - * TSX that are only exposed to SRBDS when TSX is enabled. + * Check to see if this is one of the MDS_NO systems supporting TSX that + * are only exposed to SRBDS when TSX is enabled or when CPU is affected + * by Processor MMIO Stale Data vulnerability. 
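The microcode check in mmio_select_mitigation() above reduces to a small predicate: VERW-based clearing is usable with FB_CLEAR, or with MD_CLEAR plus L1D_FLUSH on a CPU that is still MDS-affected. A userspace sketch of just that predicate (flag names mirror the patch; no real CPUID/MSR access):

#include <stdbool.h>
#include <stdio.h>

struct caps {
	bool fb_clear;		/* ARCH_CAP_FB_CLEAR */
	bool mds_no;		/* ARCH_CAP_MDS_NO */
	bool md_clear;		/* X86_FEATURE_MD_CLEAR */
	bool flush_l1d;		/* X86_FEATURE_FLUSH_L1D */
};

static bool mmio_verw_usable(const struct caps *c)
{
	return c->fb_clear ||
	       (c->md_clear && c->flush_l1d && !c->mds_no);
}

int main(void)
{
	struct caps updated  = { .fb_clear = true, .mds_no = true };
	struct caps old_mds  = { .md_clear = true, .flush_l1d = true };
	struct caps no_ucode = { .mds_no = true };

	printf("FB_CLEAR enumerated:      %d\n", mmio_verw_usable(&updated));
	printf("MDS-affected w/ MD_CLEAR: %d\n", mmio_verw_usable(&old_mds));
	printf("no suitable microcode:    %d\n", mmio_verw_usable(&no_ucode));
	return 0;
}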
*/ ia32_cap = x86_read_arch_cap_msr(); - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; @@ -484,6 +602,149 @@ static int __init srbds_parse_cmdline(char *str) early_param("srbds", srbds_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "GDS: " fmt + +enum gds_mitigations { + GDS_MITIGATION_OFF, + GDS_MITIGATION_UCODE_NEEDED, + GDS_MITIGATION_FORCE, + GDS_MITIGATION_FULL, + GDS_MITIGATION_FULL_LOCKED, + GDS_MITIGATION_HYPERVISOR, +}; + +#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; +#else +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; +#endif + +static const char * const gds_strings[] = { + [GDS_MITIGATION_OFF] = "Vulnerable", + [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode", + [GDS_MITIGATION_FULL] = "Mitigation: Microcode", + [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", + [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +bool gds_ucode_mitigated(void) +{ + return (gds_mitigation == GDS_MITIGATION_FULL || + gds_mitigation == GDS_MITIGATION_FULL_LOCKED); +} +EXPORT_SYMBOL_GPL(gds_ucode_mitigated); + +void update_gds_msr(void) +{ + u64 mcu_ctrl_after; + u64 mcu_ctrl; + + switch (gds_mitigation) { + case GDS_MITIGATION_OFF: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl |= GDS_MITG_DIS; + break; + case GDS_MITIGATION_FULL_LOCKED: + /* + * The LOCKED state comes from the boot CPU. APs might not have + * the same state. Make sure the mitigation is enabled on all + * CPUs. + */ + case GDS_MITIGATION_FULL: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl &= ~GDS_MITG_DIS; + break; + case GDS_MITIGATION_FORCE: + case GDS_MITIGATION_UCODE_NEEDED: + case GDS_MITIGATION_HYPERVISOR: + return; + }; + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + /* + * Check to make sure that the WRMSR value was not ignored. Writes to + * GDS_MITG_DIS will be ignored if this processor is locked but the boot + * processor was not. + */ + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); +} + +static void __init gds_select_mitigation(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + gds_mitigation = GDS_MITIGATION_HYPERVISOR; + goto out; + } + + if (cpu_mitigations_off()) + gds_mitigation = GDS_MITIGATION_OFF; + /* Will verify below that mitigation _can_ be disabled */ + + /* No microcode */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } else { + gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; + } + goto out; + } + + /* Microcode has mitigation, use it */ + if (gds_mitigation == GDS_MITIGATION_FORCE) + gds_mitigation = GDS_MITIGATION_FULL; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + if (mcu_ctrl & GDS_MITG_LOCKED) { + if (gds_mitigation == GDS_MITIGATION_OFF) + pr_warn("Mitigation locked. 
Disable failed.\n"); + + /* + * The mitigation is selected from the boot CPU. All other CPUs + * _should_ have the same state. If the boot CPU isn't locked + * but others are then update_gds_msr() will WARN() of the state + * mismatch. If the boot CPU is locked update_gds_msr() will + * ensure the other CPUs have the mitigation enabled. + */ + gds_mitigation = GDS_MITIGATION_FULL_LOCKED; + } + + update_gds_msr(); +out: + pr_info("%s\n", gds_strings[gds_mitigation]); +} + +static int __init gds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return 0; + + if (!strcmp(str, "off")) + gds_mitigation = GDS_MITIGATION_OFF; + else if (!strcmp(str, "force")) + gds_mitigation = GDS_MITIGATION_FORCE; + + return 0; +} +early_param("gather_data_sampling", gds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -575,12 +836,103 @@ static int __init nospectre_v1_cmdline(char *str) } early_param("nospectre_v1", nospectre_v1_cmdline); -#undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 : " fmt - static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +#undef pr_fmt +#define pr_fmt(fmt) "RETBleed: " fmt + +enum retbleed_mitigation { + RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_IBRS, + RETBLEED_MITIGATION_EIBRS, +}; + +enum retbleed_mitigation_cmd { + RETBLEED_CMD_OFF, + RETBLEED_CMD_AUTO, +}; + +const char * const retbleed_strings[] = { + [RETBLEED_MITIGATION_NONE] = "Vulnerable", + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", +}; + +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = + RETBLEED_MITIGATION_NONE; +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = + RETBLEED_CMD_AUTO; + +static int __init retbleed_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + retbleed_cmd = RETBLEED_CMD_OFF; + else if (!strcmp(str, "auto")) + retbleed_cmd = RETBLEED_CMD_AUTO; + else + pr_err("Unknown retbleed option (%s). Defaulting to 'auto'\n", str); + + return 0; +} +early_param("retbleed", retbleed_parse_cmdline); + +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" +#define RETBLEED_COMPILER_MSG "WARNING: kernel not compiled with RETPOLINE or -mfunction-return capable compiler!\n" +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" + +static void __init retbleed_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + switch (retbleed_cmd) { + case RETBLEED_CMD_OFF: + return; + + case RETBLEED_CMD_AUTO: + default: + /* + * The Intel mitigation (IBRS) was already selected in + * spectre_v2_select_mitigation(). + */ + + break; + } + + switch (retbleed_mitigation) { + default: + break; + } + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option. 
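A detail worth illustrating from update_gds_msr() above: a processor locked by firmware silently ignores writes to GDS_MITG_DIS, so the MSR is read back and compared after the write. A stubbed sketch of that verify-after-write step (simulated MSR; bit positions are placeholders, not authoritative layout):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GDS_MITG_DIS	(1ULL << 4)	/* placeholder bit position */
#define GDS_MITG_LOCKED	(1ULL << 5)	/* placeholder bit position */

static uint64_t mcu_opt_ctrl = GDS_MITG_LOCKED;	/* simulated MCU_OPT_CTRL */

static uint64_t rdmsr_mcu(void) { return mcu_opt_ctrl; }

static void wrmsr_mcu(uint64_t val)
{
	/* A locked MSR drops attempts to change GDS_MITG_DIS. */
	if (mcu_opt_ctrl & GDS_MITG_LOCKED)
		val = (val & ~GDS_MITG_DIS) | (mcu_opt_ctrl & GDS_MITG_DIS);
	mcu_opt_ctrl = val | GDS_MITG_LOCKED;
}

int main(void)
{
	uint64_t want = rdmsr_mcu() | GDS_MITG_DIS;	/* try to disable the mitigation */

	wrmsr_mcu(want);
	if (rdmsr_mcu() != want)
		printf("WRMSR ignored: mitigation stays enabled (locked)\n");
	else
		printf("mitigation state updated\n");
	return 0;
}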
+ */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + pr_err(RETBLEED_INTEL_MSG); + } + } + + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = SPECTRE_V2_USER_NONE; static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = @@ -607,6 +959,33 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -621,7 +1000,11 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, - SPECTRE_V2_CMD_RETPOLINE_AMD, + SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, + SPECTRE_V2_CMD_IBRS, }; enum spectre_v2_user_cmd { @@ -662,13 +1045,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; + static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; - switch (v2_cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; case SPECTRE_V2_CMD_FORCE: @@ -694,8 +1079,20 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; +} + +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) +{ + return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; +} + static void __init -spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); @@ -708,7 +1105,7 @@ 
spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; - cmd = spectre_v2_parse_user_cmdline(v2_cmd); + cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; @@ -756,10 +1153,19 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If enhanced IBRS is enabled or SMT impossible, STIBP is not - * required. + * If no STIBP, enhanced IBRS is enabled, or SMT impossible, STIBP + * is not required. + * + * Enhanced IBRS also protects against cross-thread branch target + * injection in user-mode as the IBRS bit remains always set which + * implicitly enables cross-thread protections. However, in legacy IBRS + * mode, the IBRS bit is set only on kernel entry and cleared on return + * to userspace. This disables the implicit cross-thread protection, + * so allow for STIBP to be selected in that case. */ - if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return; /* @@ -771,12 +1177,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* - * If STIBP is not available, clear the STIBP mode. - */ - if (!boot_cpu_has(X86_FEATURE_STIBP)) - mode = SPECTRE_V2_USER_NONE; - spectre_v2_user_stibp = mode; set_mode: @@ -785,9 +1185,12 @@ set_mode: static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; static const struct { @@ -798,9 +1201,14 @@ static const struct { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, }; static void __init spec_v2_print_cond(const char *reason, bool secure) @@ -836,16 +1244,48 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected but not compiled in. 
Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -854,6 +1294,80 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + return SPECTRE_V2_RETPOLINE; +} + +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + } +} + +static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +{ + /* + * Similar to context switches, there are two types of RSB attacks + * after VM exit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB + * bug is present then a LITE version of RSB protection is required, + * just a single call needs to retire before a RET is executed. 
+ */ + switch (mode) { + case SPECTRE_V2_NONE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS: + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB) && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) { + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; + + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); + dump_stack(); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -874,85 +1388,161 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_IBRS_ENHANCED; - /* Force it so VMEXIT will restore correctly */ - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - goto specv2_set_mode; + mode = SPECTRE_V2_EIBRS; + break; } - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_cmd != RETBLEED_CMD_OFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + mode = SPECTRE_V2_IBRS; + break; + } + + mode = spectre_v2_select_retpoline(); break; - case SPECTRE_V2_CMD_RETPOLINE_AMD: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_amd; + + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + pr_err(SPECTRE_V2_LFENCE_MSG); + mode = SPECTRE_V2_LFENCE; break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_generic; + mode = SPECTRE_V2_RETPOLINE; break; + case SPECTRE_V2_CMD_RETPOLINE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_IBRS: + mode = SPECTRE_V2_IBRS; + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; break; } - pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); - return; -retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - retpoline_amd: - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); - goto retpoline_generic; - } - mode = SPECTRE_V2_RETPOLINE_AMD; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); - } else { - retpoline_generic: - mode = SPECTRE_V2_RETPOLINE_GENERIC; + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + + if (spectre_v2_in_ibrs_mode(mode)) { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + update_spec_ctrl(x86_spec_ctrl_base); + } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + /* fallthrough */ + + case SPECTRE_V2_RETPOLINE: + case 
SPECTRE_V2_EIBRS_RETPOLINE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; } -specv2_set_mode: + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. + */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* - * If spectre v2 protection has been enabled, unconditionally fill - * RSB during a context switch; this protects against two independent - * issues: + * If Spectre v2 protection has been enabled, fill the RSB during a + * context switch. In general there are two types of RSB attacks + * across context switches, for which the CALLs/RETs may be unbalanced. + * + * 1) RSB underflow + * + * Some Intel parts have "bottomless RSB". When the RSB is empty, + * speculated return targets may come from the branch predictor, + * which could have a user-poisoned BTB or BHB entry. + * + * AMD has it even worse: *all* returns are speculated from the BTB, + * regardless of the state of the RSB. + * + * When IBRS or eIBRS is enabled, the "user -> kernel" attack + * scenario is mitigated by the IBRS branch prediction isolation + * properties, so the RSB buffer filling wouldn't be necessary to + * protect against this type of attack. + * + * The "user -> user" attack scenario is mitigated by RSB filling. + * + * 2) Poisoned RSB entry + * + * If the 'next' in-kernel return stack is shorter than 'prev', + * 'next' could be tricked into speculating with a user-poisoned RSB + * entry. + * + * The "user -> kernel" attack scenario is mitigated by SMEP and + * eIBRS. + * + * The "user -> user" scenario, also known as SpectreBHB, requires + * RSB clearing. * - * - RSB underflow (and switch to BTB) on Skylake+ - * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs + * So to mitigate all cases, unconditionally fill RSB on context + * switches. + * + * FIXME: Is this pointless for retbleed-affected AMD? */ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); + spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + /* - * Retpoline means the kernel is safe because it has no indirect - * branches. Enhanced IBRS protects firmware too, so, enable restricted - * speculation around firmware calls only when Enhanced IBRS isn't - * supported. + * Retpoline protects the kernel, but doesn't protect firmware. IBRS + * and Enhanced IBRS protect firmware too, so enable IBRS around + * firmware calls only when IBRS / Enhanced IBRS aren't otherwise + * enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. 
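To make the firmware-IBRS decision below concrete, a small table-driven sketch (local enum, not the kernel's; SPECTRE_V2_NONE is omitted since spectre_v2=off returns before this point) of which modes still need X86_FEATURE_USE_IBRS_FW on a CPU that enumerates IBRS:

#include <stdbool.h>
#include <stdio.h>

enum v2_mode { RETPOLINE, LFENCE, IBRS, EIBRS, EIBRS_RETPOLINE, EIBRS_LFENCE };

static const char * const names[] = {
	"retpoline", "lfence", "ibrs",
	"eibrs", "eibrs,retpoline", "eibrs,lfence",
};

static bool in_ibrs_mode(enum v2_mode m)
{
	return m == IBRS || m == EIBRS || m == EIBRS_RETPOLINE || m == EIBRS_LFENCE;
}

int main(void)
{
	bool cpu_has_ibrs = true;	/* assume the CPU enumerates IBRS */

	for (int m = RETPOLINE; m <= EIBRS_LFENCE; m++)
		printf("%-16s firmware IBRS: %s\n", names[m],
		       (cpu_has_ibrs && !in_ibrs_mode(m)) ? "yes" : "no");
	return 0;
}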
*/ - if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { + if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_user_select_mitigation(cmd); + spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) { - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); + update_spec_ctrl(val); } /* Update x86_spec_ctrl_base in case SMT state changed. */ @@ -987,6 +1577,8 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { + u64 ia32_cap = x86_read_arch_cap_msr(); + /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -998,19 +1590,26 @@ static void update_mds_branch_idle(void) if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) return; - if (sched_smt_active()) + if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); - else + } else if (mmio_mitigation == MMIO_MITIGATION_OFF || + (ia32_cap & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); + } } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" #define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" void arch_smt_update(void) { mutex_lock(&spec_ctrl_mutex); + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; @@ -1046,6 +1645,16 @@ void arch_smt_update(void) break; } + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1150,16 +1759,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) } /* - * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper - * bit in the mask to allow guests to use the mitigation even in the - * case where the host does not enable it. - */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; - } - - /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. 
* - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass @@ -1176,7 +1775,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + update_spec_ctrl(x86_spec_ctrl_base); } } @@ -1255,7 +1854,6 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; - /* * With strict mode for both IBPB and STIBP, the instruction * code paths avoid checking this task flag and instead, @@ -1295,6 +1893,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); task_update_spec_tif(task); + if (task == current) + indirect_branch_prediction_barrier(); break; default: return -ERANGE; @@ -1382,7 +1982,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + update_spec_ctrl(x86_spec_ctrl_base); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); @@ -1598,9 +2198,26 @@ static ssize_t tsx_async_abort_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t mmio_stale_data_show_state(char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return sysfs_emit(buf, "Unknown: No mitigations\n"); + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mmio_strings[mmio_mitigation]); + } + + return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return ""; switch (spectre_v2_user_stibp) { @@ -1630,11 +2247,56 @@ static char *ibpb_state(void) return ""; } +static char *pbrsb_eibrs_state(void) +{ + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; + } else { + return ", PBRSB-eIBRS: Not affected"; + } +} + +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sprintf(buf, "Vulnerable: LFENCE\n"); + + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + + return sprintf(buf, "%s%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
", RSB filling" : "", + pbrsb_eibrs_state(), + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); } +static ssize_t retbleed_show_state(char *buf) +{ + return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); +} + +static ssize_t gds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1655,12 +2317,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); @@ -1682,6 +2339,16 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SRBDS: return srbds_show_state(buf); + case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_MMIO_UNKNOWN: + return mmio_stale_data_show_state(buf); + + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + + case X86_BUG_GDS: + return gds_show_state(buf); + default: break; } @@ -1733,4 +2400,22 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * { return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); } + +ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); +} + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} + +ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_GDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7e7400af7797..d315e928b95c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,14 +13,20 @@ #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/sched/task.h> +#include <linux/sched/smt.h> #include <linux/init.h> #include <linux/kprobes.h> #include <linux/kgdb.h> +#include <linux/mem_encrypt.h> #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/io.h> #include <linux/syscore_ops.h> #include <asm/stackprotector.h> +#include <linux/utsname.h> + +#include <asm/alternative.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/archrandom.h> @@ -56,6 +62,7 @@ #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> #endif +#include <asm/set_memory.h> #include "cpu.h" @@ -954,6 +961,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define MSBDS_ONLY BIT(5) #define NO_SWAPGS BIT(6) #define NO_ITLB_MULTIHIT BIT(7) +#define NO_MMIO BIT(8) +#define NO_EIBRS_PBRSB BIT(9) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -971,6 +980,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { 
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ + VULNWL_INTEL(TIGERLAKE, NO_MMIO), + VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(ALDERLAKE, NO_MMIO), + VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -988,9 +1002,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1000,37 +1014,78 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), {} }; +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, steppings, \ X86_FEATURE_ANY, issues) +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + #define SRBDS BIT(0) +/* CPU is affected by X86_BUG_MMIO_STALE_DATA */ +#define MMIO BIT(1) +/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ +#define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) +/* CPU is affected by SMT (cross-thread) return predictions */ +#define SMT_RSB BIT(4) +/* CPU is affected by SRSO */ +#define SRSO BIT(5) +/* CPU is affected by GDS */ +#define GDS BIT(6) static const struct x86_cpu_id 
cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_CORE, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_ULT, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_GT3E, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_XEON_D,X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL_GT3E, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL_CORE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPINGS(0x0, 0xC), SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPINGS(0x0, 0xD), SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(CANNONLAKE_MOBILE,X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_MOBILE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_XEON_D, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED), {} }; @@ -1051,6 +1106,13 @@ u64 x86_read_arch_cap_msr(void) return ia32_cap; } +static bool arch_cap_mmio_immune(u64 ia32_cap) +{ + return (ia32_cap & ARCH_CAP_FBSDP_NO && + ia32_cap & ARCH_CAP_PSDP_NO && + ia32_cap & ARCH_CAP_SBDR_SSDP_NO); +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); @@ -1102,12 +1164,53 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) /* * SRBDS affects CPUs which support RDRAND or RDSEED and are listed * in the vulnerability blacklist. + * + * Some of the implications and mitigation of Shared Buffers Data + * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as + * SRBDS. */ if ((cpu_has(c, X86_FEATURE_RDRAND) || cpu_has(c, X86_FEATURE_RDSEED)) && - cpu_matches(cpu_vuln_blacklist, SRBDS)) + cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS)) setup_force_cpu_bug(X86_BUG_SRBDS); + /* + * Processor MMIO Stale Data bug enumeration + * + * Affected CPU list is generally enough to enumerate the vulnerability, + * but for virtualization case check for ARCH_CAP MSR bits also, VMM may + * not want the guest to enumerate the bug. 
+ * + * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, + * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. + */ + if (!arch_cap_mmio_immune(ia32_cap)) { + if (cpu_matches(cpu_vuln_blacklist, MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } + + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + } + + if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && + !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + + /* + * Check if CPU is vulnerable to GDS. If running in a virtual machine on + * an affected processor, the VMM may have disabled the use of GATHER by + * disabling AVX2. The only way to do this in HW is to clear XCR0[2], + * which means that AVX will be disabled. + */ + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + boot_cpu_has(X86_FEATURE_AVX)) + setup_force_cpu_bug(X86_BUG_GDS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1193,8 +1296,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); - fpu__init_system(c); - #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says @@ -1583,6 +1684,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); + if (boot_cpu_has_bug(X86_BUG_GDS)) + update_gds_msr(); } static __init int setup_noclflush(char *arg) @@ -1900,8 +2003,6 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu__init_cpu(); - if (is_uv_system()) uv_cpu_init(); @@ -1965,8 +2066,6 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu__init_cpu(); - load_fixmap_gdt(cpu); } #endif @@ -1999,6 +2098,8 @@ void microcode_check(void) perf_check_microcode(); + amd_check_microcode(); + /* Reload CPUID max function as it might've changed. */ info.cpuid_level = cpuid_eax(0); @@ -2017,3 +2118,69 @@ void microcode_check(void) pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); } + +void __init arch_cpu_finalize_init(void) +{ + identify_boot_cpu(); + + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology(); + + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + + cpu_select_mitigations(); + + arch_smt_update(); + + if (IS_ENABLED(CONFIG_X86_32)) { + /* + * Check whether this is a real i386 which is not longer + * supported and fixup the utsname. + */ + if (boot_cpu_data.x86 < 4) + panic("Kernel requires i486+ for 'invlpg' and other features"); + + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + } + + /* + * Must be before alternatives because it might set or clear + * feature bits. + */ + fpu__init_system(); + fpu__init_cpu(); + + alternative_instructions(); + + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. 
+ * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); + } else { + fpu__init_check_bugs(); + } + + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. It + * must be called after late_time_init() so that Hyper-V x86/x64 + * hypercalls work when the SWIOTLB bounce buffers are decrypted. + */ + mem_encrypt_init(); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4eb9bf68b122..ca1b8bf380a2 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -79,9 +79,11 @@ extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); +void cpu_select_mitigations(void); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); +extern void update_gds_msr(void); extern u64 x86_read_arch_cap_msr(void); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a5287b18a63f..76692590eff8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -71,7 +71,7 @@ static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) { ring3mwait_disabled = true; - return 0; + return 1; } __setup("ring3mwait=disable", ring3mwait_disable); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index f406e3b85bdb..1125f752f126 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -585,8 +585,10 @@ static int __rdtgroup_move_task(struct task_struct *tsk, /* * Ensure the task's closid and rmid are written before determining if * the task is current that will decide if it will be interrupted. + * This pairs with the full barrier between the rq->curr update and + * resctrl_sched_in() during context switch. */ - barrier(); + smp_mb(); /* * By now, the task's closid and rmid are set. If the task is current @@ -2140,19 +2142,23 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, t->closid = to->closid; t->rmid = to->mon.rmid; -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * Order the closid/rmid stores above before the loads + * in task_curr(). This pairs with the full barrier + * between the rq->curr update and resctrl_sched_in() + * during context switch. + */ + smp_mb(); + + /* + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. 
*/ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 751e59057466..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -16,12 +16,17 @@ * respective wildcard entries. * * A typical table entry would be to match a specific CPU - * { X86_VENDOR_INTEL, 6, 0x12 } - * or to match a specific CPU feature - * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, + * X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, - * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) + * + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts + * for various common selections. The above can be shortened to: + * + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); * * Arrays used to match for this should also be declared using * MODULE_DEVICE_TABLE(x86cpu, ...) @@ -53,3 +58,34 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) return NULL; } EXPORT_SYMBOL(x86_match_cpu); + +static const struct x86_cpu_desc * +x86_match_cpu_with_stepping(const struct x86_cpu_desc *match) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + const struct x86_cpu_desc *m; + + for (m = match; m->x86_family | m->x86_model; m++) { + if (c->x86_vendor != m->x86_vendor) + continue; + if (c->x86 != m->x86_family) + continue; + if (c->x86_model != m->x86_model) + continue; + if (c->x86_stepping != m->x86_stepping) + continue; + return m; + } + return NULL; +} + +bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) +{ + const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table); + + if (!res || res->x86_microcode_rev > boot_cpu_data.microcode) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 9cc524be3c94..14dc3c1f7fb4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -354,7 +354,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, char buf[MAX_FLAG_OPT_SIZE], *__buf; int err; - if (cnt > MAX_FLAG_OPT_SIZE) + if (!cnt || cnt > MAX_FLAG_OPT_SIZE) return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2a13468f8773..8f36ccf26cec 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -295,11 +295,17 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) { - int apei_err = 0; struct llist_node *pending; struct mce_evt_llist *l; + int apei_err = 0; + + /* + * Allow instrumentation around external facilities usage. Not that it + * matters a whole lot since the machine is going to panic anyway. 
+ */ + instrumentation_begin(); if (!fake_panic) { /* @@ -314,7 +320,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) } else { /* Don't log too much for fake panic */ if (atomic_inc_return(&mce_fake_panicked) > 1) - return; + goto out; } pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ @@ -352,6 +358,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); + +out: + instrumentation_end(); } /* Support code for software error injection */ @@ -642,7 +651,7 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(msr_ops.misc(i)); @@ -1021,10 +1030,13 @@ static int mce_start(int *no_way_out) * Synchronize between CPUs after main scanning loop. * This invokes the bulk of the Monarch processing. */ -static int mce_end(int order) +static noinstr int mce_end(int order) { - int ret = -1; u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int ret = -1; + + /* Allow instrumentation around external facilities. */ + instrumentation_begin(); if (!timeout) goto reset; @@ -1068,7 +1080,8 @@ static int mce_end(int order) /* * Don't reset anything. That's done by the Monarch. */ - return 0; + ret = 0; + goto out; } /* @@ -1083,6 +1096,10 @@ reset: * Let others run again. */ atomic_set(&mce_executing, 0); + +out: + instrumentation_end(); + return ret; } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a4e7e100ed26..5698e04803b5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -54,7 +54,9 @@ struct cont_desc { }; static u32 ucode_new_rev; -static u8 amd_ucode_patch[PATCH_MAX_SIZE]; + +/* One blob per node. */ +static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; /* * Microcode patch container file is prepended to the initrd in cpio @@ -210,7 +212,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; - patch = &amd_ucode_patch; + patch = &amd_ucode_patch[0]; #endif desc.cpuid_1_eax = cpuid_1_eax; @@ -222,7 +224,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p return ret; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev >= mc->hdr.patch_id) + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) return ret; if (!__apply_microcode_amd(mc)) { @@ -304,8 +312,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* Check whether we have saved a new patch already: */ - if (*new_rev && rev < mc->hdr.patch_id) { + /* + * Check whether a new patch has been saved already. Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. 
+ */ + if (*new_rev && rev <= mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; return; @@ -319,8 +331,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); } -static enum ucode_state -load_microcode_amd(bool save, u8 family, const u8 *data, size_t size); +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { @@ -338,19 +349,19 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) if (!desc.mc) return -EINVAL; - ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); + ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); if (ret > UCODE_UPDATED) return -EINVAL; return 0; } -void reload_ucode_amd(void) +void reload_ucode_amd(unsigned int cpu) { - struct microcode_amd *mc; u32 rev, dummy; + struct microcode_amd *mc; - mc = (struct microcode_amd *)amd_ucode_patch; + mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); @@ -521,7 +532,7 @@ static enum ucode_state apply_microcode_amd(int cpu) rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ - if (rev >= mc_amd->hdr.patch_id) { + if (rev > mc_amd->hdr.patch_id) { ret = UCODE_OK; goto out; } @@ -688,9 +699,10 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -static enum ucode_state -load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) { + struct cpuinfo_x86 *c; + unsigned int nid, cpu; struct ucode_patch *p; enum ucode_state ret; @@ -703,22 +715,22 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) return ret; } - p = find_patch(0); - if (!p) { - return ret; - } else { - if (boot_cpu_data.microcode >= p->patch_id) - return ret; + for_each_node(nid) { + cpu = cpumask_first(cpumask_of_node(nid)); + c = &cpu_data(cpu); - ret = UCODE_NEW; - } + p = find_patch(cpu); + if (!p) + continue; - /* save BSP's matching patch for early load */ - if (!save) - return ret; + if (c->microcode >= p->patch_id) + continue; + + ret = UCODE_NEW; - memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE); + memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + } return ret; } @@ -744,12 +756,11 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); - bool bsp = c->cpu_index == boot_cpu_data.cpu_index; enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; /* reload ucode container only on the boot cpu */ - if (!refresh_fw || !bsp) + if (!refresh_fw) return UCODE_OK; if (c->x86 >= 0x15) @@ -766,7 +777,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size); + ret = load_microcode_amd(c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index eab4de387ce6..963b989710f9 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -326,7 +326,7 @@ struct cpio_data 
find_microcode_in_initrd(const char *path, bool use_pa) #endif } -void reload_early_microcode(void) +void reload_early_microcode(unsigned int cpu) { int vendor, family; @@ -340,7 +340,7 @@ void reload_early_microcode(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - reload_ucode_amd(); + reload_ucode_amd(cpu); break; default: break; @@ -773,9 +773,9 @@ static struct subsys_interface mc_cpu_interface = { }; /** - * mc_bp_resume - Update boot CPU microcode during resume. + * microcode_bsp_resume - Update boot CPU microcode during resume. */ -static void mc_bp_resume(void) +void microcode_bsp_resume(void) { int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -783,11 +783,11 @@ static void mc_bp_resume(void) if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); else if (!uci->mc) - reload_early_microcode(); + reload_early_microcode(cpu); } static struct syscore_ops mc_syscore_ops = { - .resume = mc_bp_resume, + .resume = microcode_bsp_resume, }; static int mc_cpu_starting(unsigned int cpu) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 3aa0e5a45303..31ad79a0c6f3 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -662,7 +662,6 @@ void load_ucode_intel_ap(void) else iup = &intel_ucode_patch; -reget: if (!*iup) { patch = __load_ucode_intel(&uci); if (!patch) @@ -673,12 +672,7 @@ reget: uci.mc = *iup; - if (apply_microcode_early(&uci, true)) { - /* Mixed-silicon system? Try to refetch the proper patch: */ - *iup = NULL; - - goto reget; - } + apply_microcode_early(&uci, true); } static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 5a52672e3f8b..90bd155d7e7a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -21,6 +21,7 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 71ca064e3794..31fe56a90cbf 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -44,7 +44,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) * initial apic id, which also represents 32-bit extended x2apic id. */ c->initial_apicid = edx; - smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); #endif return 0; } @@ -68,7 +68,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c) * Populate HT related information from sub-leaf level 0. 
*/ cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index 032509adf9de..88a553ee7704 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -55,24 +55,6 @@ void tsx_enable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -static bool __init tsx_ctrl_is_supported(void) -{ - u64 ia32_cap = x86_read_arch_cap_msr(); - - /* - * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this - * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. - * - * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a - * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES - * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get - * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, - * tsx= cmdline requests will do nothing on CPUs without - * MSR_IA32_TSX_CTRL support. - */ - return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); -} - static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) { if (boot_cpu_has_bug(X86_BUG_TAA)) @@ -86,9 +68,22 @@ void __init tsx_init(void) char arg[5] = {}; int ret; - if (!tsx_ctrl_is_supported()) + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR)) return; + setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL); + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); if (ret >= 0) { if (!strcmp(arg, "on")) { diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 91b3483e5085..e40bb50c5c4f 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -35,7 +35,6 @@ #include <linux/kdebug.h> #include <asm/cpu.h> #include <asm/reboot.h> -#include <asm/virtext.h> #include <asm/intel_pt.h> /* Used while preparing memory map entries for second kernel */ @@ -86,15 +85,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Disable VMX or SVM if needed. - * - * We need to disable virtualization on all CPUs. - * Having VMX or SVM enabled on any CPU may break rebooting - * after the kdump kernel has finished its task. - */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); - /* * Disable Intel PT to stop its logging */ @@ -153,12 +143,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Booting kdump kernel with VMX or SVM enabled won't work, - * because (among other limitations) we can't disable paging - * with the virt flags. 
- */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); + cpu_emergency_disable_virtualization(); /* * Disable Intel PT to stop its logging diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2b5886401e5f..7e698c45760c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -171,7 +171,6 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); - stack = stack ? : get_stack_pointer(task, regs); regs = unwind_get_entry_regs(&state, &partial); /* @@ -190,9 +189,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (stack = stack ?: get_stack_pointer(task, regs); + stack; + stack = stack_info.next_sp) { const char *stack_name; + stack = PTR_ALIGN(stack, sizeof(long)); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { /* * We weren't on a valid stack. It's possible that @@ -326,7 +329,7 @@ unsigned long oops_begin(void) } NOKPROBE_SYMBOL(oops_begin); -void __noreturn rewind_stack_do_exit(int signr); +void __noreturn rewind_stack_and_make_dead(int signr); void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { @@ -361,7 +364,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) * reuse the task stack and that existing poisons are invalid. */ kasan_unpoison_task_stack(current); - rewind_stack_do_exit(signr); + rewind_stack_and_make_dead(signr); } NOKPROBE_SYMBOL(oops_end); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 50d5848bf22e..d5352b0ae091 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -515,6 +515,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = { .stolen_size = gen9_stolen_size, }; +/* Intel integrated GPUs for which we need to reserve "stolen memory" */ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_I830_IDS(&i830_early_ops), INTEL_I845G_IDS(&i845_early_ops), @@ -584,6 +585,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func) u16 device; int i; + /* + * Reserve "stolen memory" for an integrated GPU. If we've already + * found one, there's nothing to do for other (discrete) GPUs. + */ + if (resource_size(&intel_graphics_stolen_res)) + return; + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { @@ -696,7 +704,7 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, - QFLAG_APPLY_ONCE, intel_graphics_quirks }, + 0, intel_graphics_quirks }, /* * HPET on the current version of the Baytrail platform has accuracy * problems: it will halt in deep idle state - so we disable it. 
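
The cpu_show_mmio_stale_data(), cpu_show_retbleed() and cpu_show_gds() handlers added in arch/x86/kernel/cpu/bugs.c above feed the generic per-CPU vulnerability reporting in sysfs. A minimal user-space sketch for reading the resulting files follows; it is illustrative only and assumes the conventional attribute names mmio_stale_data, retbleed and gather_data_sampling under /sys/devices/system/cpu/vulnerabilities/, which are wired up in drivers/base/cpu.c outside this diff.

/*
 * Illustrative sketch, not part of the patch: print the mitigation strings
 * produced by the cpu_show_*() handlers above. The file names are assumed
 * from the usual drivers/base/cpu.c attribute wiring.
 */
#include <stdio.h>

int main(void)
{
	static const char * const files[] = {
		"mmio_stale_data", "retbleed", "gather_data_sampling",
	};
	char path[128], line[256];

	for (unsigned int i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/vulnerabilities/%s", files[i]);

		FILE *f = fopen(path, "r");
		if (!f)
			continue;	/* file absent on older kernels or unaffected configs */

		if (fgets(line, sizeof(line), f))
			printf("%-22s %s", files[i], line);
		fclose(f);
	}
	return 0;
}

Each file yields the single line emitted by the corresponding show handler (via sysfs_emit() or sprintf()), so the output mirrors the mitigation state strings selected in bugs.c.
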
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 9692ccc583bb..644372a10c89 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -49,7 +49,7 @@ void fpu__init_cpu(void) fpu__init_cpu_xstate(); } -static bool fpu__probe_without_cpuid(void) +static bool __init fpu__probe_without_cpuid(void) { unsigned long cr0; u16 fsw, fcw; @@ -67,7 +67,7 @@ static bool fpu__probe_without_cpuid(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } -static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +static void __init fpu__init_system_early_generic(void) { if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { @@ -297,10 +297,10 @@ static void __init fpu__init_parse_early_param(void) * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ -void __init fpu__init_system(struct cpuinfo_x86 *c) +void __init fpu__init_system(void) { fpu__init_parse_early_param(); - fpu__init_system_early_generic(c); + fpu__init_system_early_generic(); /* * The FPU has to be operational for some of the diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7d372db8bee1..e33b732ad337 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -811,6 +811,14 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); + + /* + * CPU capabilities initialization runs before FPU init. So + * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely + * functional, set the feature bit so depending code works. + */ + setup_force_cpu_cap(X86_FEATURE_OSXSAVE); + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 88dc38b4a147..90c2613af36b 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -383,6 +383,8 @@ static void __init clear_bss(void) { memset(__bss_start, 0, (unsigned long) __bss_stop - (unsigned long) __bss_start); + memset(__brk_base, 0, + (unsigned long) __brk_limit - (unsigned long) __brk_base); } static unsigned long get_cmd_line_ptr(void) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index fe522691ac71..82753622f489 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,6 +32,7 @@ */ static void init_8259A(int auto_eoi); +static bool pcat_compat __ro_after_init; static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); @@ -114,6 +115,7 @@ static void make_8259A_irq(unsigned int irq) disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); + irq_set_status_flags(irq, IRQ_LEVEL); enable_irq(irq); lapic_assign_legacy_vector(irq, true); } @@ -300,15 +302,32 @@ static void unmask_8259A(void) static int probe_8259A(void) { + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); unsigned long flags; - unsigned char probe_val = ~(1 << PIC_CASCADE_IR); - unsigned char new_val; + + /* + * If MADT has the PCAT_COMPAT flag set, then do not bother probing + * for the PIC. Some BIOSes leave the PIC uninitialized and probing + * fails. + * + * Right now this causes problems as quite some code depends on + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. 
This is silly + * when the system has an IO/APIC because then PIC is not required + * at all, except for really old machines where the timer interrupt + * must be routed through the PIC. So just pretend that the PIC is + * there and let legacy_pic->init() initialize it for nothing. + * + * Alternatively this could just try to initialize the PIC and + * repeat the probe, but for cases where there is no PIC that's + * just pointless. + */ + if (pcat_compat) + return nr_legacy_irqs(); + /* - * Check to see if we have a PIC. - * Mask all except the cascade and read - * back the value we just wrote. If we don't - * have a PIC, we will read 0xff as opposed to the - * value we wrote. + * Check to see if we have a PIC. Mask all except the cascade and + * read back the value we just wrote. If we don't have a PIC, we + * will read 0xff as opposed to the value we wrote. */ raw_spin_lock_irqsave(&i8259A_lock, flags); @@ -430,5 +449,9 @@ static int __init i8259A_init_ops(void) return 0; } - device_initcall(i8259A_init_ops); + +void __init legacy_pic_pcat_compat(void) +{ + pcat_compat = true; +} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a0693b71cfc1..f2c215e1f64c 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -72,8 +72,10 @@ void __init init_ISA_irqs(void) legacy_pic->init(0); - for (i = 0; i < nr_legacy_irqs(); i++) + for (i = 0; i < nr_legacy_irqs(); i++) { irq_set_chip_and_handler(i, chip, handle_level_irq); + irq_set_status_flags(i, IRQ_LEVEL); + } } void __init init_IRQ(void) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 544bd41a514c..36b5a493e5b4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -56,8 +56,8 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) /* This function only handles jump-optimized kprobe */ if (kp && kprobe_optimized(kp)) { op = container_of(kp, struct optimized_kprobe, kp); - /* If op->list is not empty, op is under optimizing */ - if (list_empty(&op->list)) + /* If op is optimized or under unoptimizing */ + if (list_empty(&op->list) || optprobe_queued_unopt(op)) goto found; } } @@ -328,7 +328,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op) for (i = 1; i < op->optinsn.size; i++) { p = get_kprobe(op->kp.addr + i); - if (p && !kprobe_disabled(p)) + if (p && !kprobe_disarmed(p)) return -EEXIST; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index cee45d46e67d..bd6ee0bfa1d2 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -480,7 +480,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { ipi_bitmap <<= min - apic_id; min = apic_id; - } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { max = apic_id < max ? max : apic_id; } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 6b07faaa1579..23154d24b117 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -27,6 +27,11 @@ static __init int register_e820_pmem(void) * simply here to trigger the module to load on demand. 
*/ pdev = platform_device_alloc("e820_pmem", -1); - return platform_device_add(pdev); + + rc = platform_device_add(pdev); + if (rc) + platform_device_put(pdev); + + return rc; } device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index cd138bfd926c..e8d40a5979ec 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -434,7 +434,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } if (updmsr) - wrmsrl(MSR_IA32_SPEC_CTRL, msr); + update_spec_ctrl_cond(msr); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 736348ead421..2ecf1dcc86b2 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -96,7 +96,7 @@ static void ich_force_hpet_resume(void) static void ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(rcba); + u32 rcba; int err = 0; if (hpet_address || force_hpet_address) @@ -186,7 +186,7 @@ static void hpet_print_force_info(void) static void old_ich_force_hpet_resume(void) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (!force_hpet_address || !cached_dev) return; @@ -208,7 +208,7 @@ static void old_ich_force_hpet_resume(void) static void old_ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (hpet_address || force_hpet_address) return; @@ -299,7 +299,7 @@ static void vt8237_force_hpet_resume(void) static void vt8237_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; @@ -430,7 +430,7 @@ static void nvidia_force_hpet_resume(void) static void nvidia_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index b0f3a996df15..444b8a691c14 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -536,33 +536,29 @@ static inline void kb_wait(void) } } -static void vmxoff_nmi(int cpu, struct pt_regs *regs) -{ - cpu_emergency_vmxoff(); -} +static inline void nmi_shootdown_cpus_on_restart(void); -/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ -static void emergency_vmx_disable_all(void) +static void emergency_reboot_disable_virtualization(void) { /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* - * Disable VMX on all CPUs before rebooting, otherwise we risk hanging - * the machine, because the CPU blocks INIT when it's in VMX root. + * Disable virtualization on all CPUs before rebooting to avoid hanging + * the system, as VMX and SVM block INIT when running in the host. * * We can't take any locks and we may be on an inconsistent state, so - * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. + * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. * - * Do the NMI shootdown even if VMX if off on _this_ CPU, as that - * doesn't prevent a different CPU from being in VMX root operation. + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as + * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx()) { - /* Safely force _this_ CPU out of VMX root operation. */ - __cpu_emergency_vmxoff(); + if (cpu_has_vmx() || cpu_has_svm(NULL)) { + /* Safely force _this_ CPU out of VMX/SVM operation. 
*/ + cpu_emergency_disable_virtualization(); - /* Halt and exit VMX root operation on the other CPUs. */ - nmi_shootdown_cpus(vmxoff_nmi); + /* Disable VMX/SVM and halt on other CPUs. */ + nmi_shootdown_cpus_on_restart(); } } @@ -599,7 +595,7 @@ static void native_machine_emergency_restart(void) unsigned short mode; if (reboot_emergency) - emergency_vmx_disable_all(); + emergency_reboot_disable_virtualization(); tboot_shutdown(TB_SHUTDOWN_REBOOT); @@ -804,6 +800,17 @@ void machine_crash_shutdown(struct pt_regs *regs) /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); +} + #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; @@ -826,7 +833,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; local_irq_disable(); - shootdown_callback(cpu, regs); + if (shootdown_callback) + shootdown_callback(cpu, regs); + + /* + * Prepare the CPU for reboot _after_ invoking the callback so that the + * callback can safely use virtualization instructions, e.g. VMCLEAR. + */ + cpu_emergency_disable_virtualization(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -842,18 +856,32 @@ static void smp_send_nmi_allbutself(void) apic->send_IPI_allbutself(NMI_VECTOR); } -/* - * Halt all other CPUs, calling the specified function on each of them +/** + * nmi_shootdown_cpus - Stop other CPUs via NMI + * @callback: Optional callback to be invoked from the NMI handler + * + * The NMI handler on the remote CPUs invokes @callback, if not + * NULL, first and then disables virtualization to ensure that + * INIT is recognized during reboot. * - * This function can be used to halt all other CPUs on crash - * or emergency reboot time. The function passed as parameter - * will be called inside a NMI handler on all CPUs. + * nmi_shootdown_cpus() can only be invoked once. After the first + * invocation all other CPUs are stuck in crash_nmi_callback() and + * cannot respond to a second NMI. */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; + local_irq_disable(); + /* + * Avoid certain doom if a shootdown already occurred; re-registering + * the NMI handler will cause list corruption, modifying the callback + * will do who knows what, etc... + */ + if (WARN_ON_ONCE(crash_ipi_issued)) + return; + /* Make a note of crashing cpu. Will be used in NMI callback. */ crashing_cpu = safe_smp_processor_id(); @@ -881,7 +909,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) msecs--; } - /* Leave the nmi callback set */ + /* + * Leave the nmi callback set, shootdown is a one-time thing. Clearing + * the callback could result in a NULL pointer dereference if a CPU + * (finally) responds after the timeout expires. 
+ */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) +{ + if (!crash_ipi_issued) + nmi_shootdown_cpus(NULL); } /* @@ -911,6 +949,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* No other CPUs to shoot down */ } +static inline void nmi_shootdown_cpus_on_restart(void) { } + void run_crash_ipi_callback(struct pt_regs *regs) { } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index b2b87b91f336..c94ed0b37bcd 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -33,7 +33,7 @@ #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> -#include <asm/virtext.h> +#include <asm/reboot.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -163,7 +163,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -176,7 +176,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { ipi_entering_ack_irq(); - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); irq_exit(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8783d065f927..2e4f6a1ebca5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -96,6 +96,17 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +struct mwait_cpu_dead { + unsigned int control; + unsigned int status; +}; + +/* + * Cache line aligned data for mwait_play_dead(). Separate on purpose so + * that it's unlikely to be touched by other CPUs. + */ +static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); + /* Logical package management. We might want to allocate that dynamically */ unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); @@ -231,6 +242,7 @@ static void notrace start_secondary(void *unused) #endif load_current_idt(); cpu_init(); + fpu__init_cpu(); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); @@ -1594,10 +1606,10 @@ static bool wakeup_cpu0(void) */ static inline void mwait_play_dead(void) { + struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); unsigned int eax, ebx, ecx, edx; unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; - void *mwait_ptr; int i; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) @@ -1631,13 +1643,6 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } - /* - * This should be a memory location in a cache line which is - * unlikely to be touched by other processors. The actual - * content is immaterial as it is not actually modified in any way. - */ - mwait_ptr = ¤t_thread_info()->flags; - wbinvd(); while (1) { @@ -1649,9 +1654,9 @@ static inline void mwait_play_dead(void) * case where we return around the loop. 
*/ mb(); - clflush(mwait_ptr); + clflush(md); mb(); - __monitor(mwait_ptr, 0, 0); + __monitor(md, 0, 0); mb(); __mwait(eax, 0); /* diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 60d2c3798ba2..2f97d1a1032f 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -175,8 +175,7 @@ void set_task_blockstep(struct task_struct *task, bool on) * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race - * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but - * PTRACE_KILL is not safe. + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6a78d4b36a79..9fb266c797fb 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -70,9 +70,6 @@ static int __init control_va_addr_alignment(char *str) if (*str == 0) return 1; - if (*str == '=') - str++; - if (!strcmp(str, "32")) va_align.flags = ALIGN_VA_32; else if (!strcmp(str, "64")) @@ -82,11 +79,11 @@ static int __init control_va_addr_alignment(char *str) else if (!strcmp(str, "on")) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; else - return 0; + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); return 1; } -__setup("align_va_addr", control_va_addr_alignment); +__setup("align_va_addr=", control_va_addr_alignment); SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 897da526e40e..5bc0fedb3342 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -265,6 +265,22 @@ static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = { "Lenovo ideapad D330-10IGM"), }, }, + { + /* Lenovo IdeaPad Duet 3 10IGL5 with 1200x1920 portrait screen */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, + "IdeaPad Duet 3 10IGL5"), + }, + }, + { + /* Lenovo Yoga Book X91F / X91L */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + /* Non exact match to match F + L versions */ + DMI_MATCH(DMI_PRODUCT_NAME, "Lenovo YB1-X91"), + }, + }, {}, }; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 4f17c1c94949..0c0f0eda327d 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -89,22 +89,27 @@ static struct orc_entry *orc_find(unsigned long ip); static struct orc_entry *orc_ftrace_find(unsigned long ip) { struct ftrace_ops *ops; - unsigned long caller; + unsigned long tramp_addr, offset; ops = ftrace_ops_trampoline(ip); if (!ops) return NULL; + /* Set tramp_addr to the start of the code copied by the trampoline */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) - caller = (unsigned long)ftrace_regs_call; + tramp_addr = (unsigned long)ftrace_regs_caller; else - caller = (unsigned long)ftrace_call; + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset; /* Prevent unlikely recursion */ - if (ip == caller) + if (ip == tramp_addr) return NULL; - return orc_find(caller); + return orc_find(tramp_addr); } #else static struct orc_entry *orc_ftrace_find(unsigned long ip) @@ -663,7 +668,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Otherwise, skip ahead to the user-specified 
starting frame: */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->sp < (unsigned long)first_frame)) + state->sp <= (unsigned long)first_frame)) unwind_next_frame(state); return; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index ae9e806a11de..3b87bca027b4 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -735,8 +735,9 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ break; + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); @@ -765,6 +766,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } +setup: auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 34c0652ca8b4..20d09355c9e0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -383,7 +383,7 @@ SECTIONS .brk : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ + *(.bss..brk) /* areas brk users have reserved */ __brk_limit = .; } diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index dc4f2fdf5e57..13fd54de5449 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-intel-y += vmx.o pmu_intel.o +kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 097eef712cdc..768a765c49b0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -532,6 +532,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, union cpuid10_eax eax; union cpuid10_edx edx; + if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + perf_get_x86_pmu_capability(&cap); /* @@ -675,6 +680,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); + entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; /* * IBRS, IBPB and VIRT_SSBD aren't necessarily present in diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3e182c7ae771..0cb75d29e67b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -759,8 +759,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ctxt->mode, linear); } -static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, - enum x86emul_mode mode) +static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) { ulong linear; int rc; @@ -770,41 +769,71 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); - rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); + rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; } +static inline int 
emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt) +{ + u64 efer; + struct desc_struct cs; + u16 selector; + u32 base3; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) { + /* Real mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_REAL; + return X86EMUL_CONTINUE; + } + + if (ctxt->eflags & X86_EFLAGS_VM) { + /* Protected/VM86 mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_VM86; + return X86EMUL_CONTINUE; + } + + if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS)) + return X86EMUL_UNHANDLEABLE; + + if (efer & EFER_LMA) { + if (cs.l) { + /* Proper long mode */ + ctxt->mode = X86EMUL_MODE_PROT64; + } else if (cs.d) { + /* 32 bit compatibility mode*/ + ctxt->mode = X86EMUL_MODE_PROT32; + } else { + ctxt->mode = X86EMUL_MODE_PROT16; + } + } else { + /* Legacy 32 bit / 16 bit mode */ + ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + } + + return X86EMUL_CONTINUE; +} + static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) { - return assign_eip(ctxt, dst, ctxt->mode); + return assign_eip(ctxt, dst); } -static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, - const struct desc_struct *cs_desc) +static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst) { - enum x86emul_mode mode = ctxt->mode; - int rc; + int rc = emulator_recalc_and_set_mode(ctxt); -#ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT16) { - if (cs_desc->l) { - u64 efer = 0; + if (rc != X86EMUL_CONTINUE) + return rc; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; - } else - mode = X86EMUL_MODE_PROT32; /* temporary value */ - } -#endif - if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) - mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - rc = assign_eip(ctxt, dst, mode); - if (rc == X86EMUL_CONTINUE) - ctxt->mode = mode; - return rc; + return assign_eip(ctxt, dst); } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1669,11 +1698,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; } - if (!seg_desc.p) { - err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; - goto exception; - } - dpl = seg_desc.dpl; switch (seg) { @@ -1713,12 +1737,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; - old_desc = seg_desc; - seg_desc.type |= 2; /* busy */ - ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, - sizeof(seg_desc), &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) @@ -1737,6 +1755,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, break; } + if (!seg_desc.p) { + err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; + goto exception; + } + if (seg_desc.s) { /* mark segment as accessed */ if (!(seg_desc.type & 1)) { @@ -1751,8 +1774,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | - ((u64)base3 << 32), ctxt)) - return emulate_gp(ctxt, 0); + ((u64)base3 << 32), ctxt)) + return emulate_gp(ctxt, err_code); + } + + if (seg == VCPU_SREG_TR) { + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); @@ -1972,7 +2004,7 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->modrm_reg == VCPU_SREG_SS) + if (seg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; if (ctxt->op_bytes > 2) rsp_increment(ctxt, ctxt->op_bytes - 2); @@ -2189,7 +2221,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2270,7 +2302,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, eip, &new_desc); + rc = assign_eip_far(ctxt, eip); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2892,6 +2924,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ctxt->_eip = rdx; + ctxt->mode = usermode; *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; @@ -3488,7 +3521,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); if (rc != X86EMUL_CONTINUE) goto fail; @@ -3635,11 +3668,25 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) static int em_cr_write(struct x86_emulate_ctxt *ctxt) { - if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) + int cr_num = ctxt->modrm_reg; + int r; + + if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val)) return emulate_gp(ctxt, 0); /* Disable writeback. */ ctxt->dst.type = OP_NONE; + + if (cr_num == 0) { + /* + * CR0 write might have updated CR0.PE and/or CR0.PG + * which can affect the cpu's execution mode. 
+ */ + r = emulator_recalc_and_set_mode(ctxt); + if (r != X86EMUL_CONTINUE) + return r; + } + return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 3fd6c4b2c2b7..2047edb5ff74 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -237,7 +237,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, struct kvm_vcpu *vcpu = synic_to_vcpu(synic); int ret; - if (!synic->active && !host) + if (!synic->active && (!host || data)) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -283,6 +283,9 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, case HV_X64_MSR_EOM: { int i; + if (!synic->active) + break; + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) kvm_hv_notify_acked_sint(vcpu, i); break; @@ -338,6 +341,9 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) struct kvm_lapic_irq irq; int ret, vector; + if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return -EINVAL; + if (sint >= ARRAY_SIZE(synic->sint)) return -EINVAL; @@ -544,6 +550,12 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && (!host || config)) + return 1; + trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); @@ -558,6 +570,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && (!host || count)) + return 1; + trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 56a4b9762b0c..256b00f456e6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -961,6 +961,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq->shorthand == APIC_DEST_SELF) { + if (KVM_BUG_ON(!src, kvm)) { + *r = 0; + return true; + } *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); return true; } @@ -2045,10 +2049,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { - struct kvm_lapic *apic = vcpu->arch.apic; - - apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4)); + apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) @@ -2200,13 +2201,17 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; + int r; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode, - NULL); + + r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); + if (r && lvt_type == APIC_LVTPC) + kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); + return r; } return 0; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index acc8d217f656..ad3d39c00d7f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -171,7 +171,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) } if (type 
== PERF_TYPE_RAW) - config = eventsel & X86_RAW_EVENT_MASK; + config = eventsel & AMD64_RAW_EVENT_MASK; pmc_reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 41dff881e0f0..93a135f216b2 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -247,12 +247,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_EVNTSELn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { + data &= ~pmu->reserved_bits; + if (data != pmc->eventsel) reprogram_gp_counter(pmc, data); - return 0; - } + return 0; } return 1; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 85181457413e..cd3432df0d24 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -50,6 +50,7 @@ #include <asm/kvm_para.h> #include <asm/irq_remapping.h> #include <asm/spec-ctrl.h> +#include <asm/cpu_device_id.h> #include <asm/virtext.h> #include "trace.h" @@ -4154,9 +4155,9 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) msr->data = 0; switch (msr->index) { - case MSR_F10H_DECFG: - if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; + case MSR_AMD64_DE_CFG: + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) + msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: return 1; @@ -4258,7 +4259,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x1E; } break; - case MSR_F10H_DECFG: + case MSR_AMD64_DE_CFG: msr_info->data = svm->msr_decfg; break; default: @@ -4445,7 +4446,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; - case MSR_F10H_DECFG: { + case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; msr_entry.index = msr->index; @@ -5142,8 +5143,6 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - BUG_ON(!(gif_set(svm))); - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); ++vcpu->stat.irq_injections; diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 611f9e60f815..611f9e60f815 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx/vmx.c index 77b9ed5223f3..55e52064c4ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -41,6 +41,7 @@ #include <asm/asm.h> #include <asm/cpu.h> +#include <asm/cpu_device_id.h> #include <asm/io.h> #include <asm/desc.h> #include <asm/vmx.h> @@ -212,6 +213,9 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; +/* Control for disabling CPU Fill buffer clear */ +static bool __read_mostly vmx_fb_clear_ctrl_available; + static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; @@ -1045,6 +1049,8 @@ struct vcpu_vmx { u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; u64 ept_pointer; + u64 msr_ia32_mcu_opt_ctrl; + bool disable_fb_clear; }; enum segment_cache_field { @@ -2056,6 +2062,12 @@ static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); } +static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & 
(INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2107,6 +2119,60 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) BUG_ON(error); } +static void vmx_setup_fb_clear_ctrl(void) +{ + u64 msr; + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) && + !boot_cpu_has_bug(X86_BUG_MDS) && + !boot_cpu_has_bug(X86_BUG_TAA)) { + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_FB_CLEAR_CTRL) + vmx_fb_clear_ctrl_available = true; + } +} + +static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) +{ + u64 msr; + + if (!vmx->disable_fb_clear) + return; + + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr |= FB_CLEAR_DIS; + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + /* Cache the MSR value to avoid reading it later */ + vmx->msr_ia32_mcu_opt_ctrl = msr; +} + +static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) +{ + if (!vmx->disable_fb_clear) + return; + + vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); +} + +static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) +{ + vmx->disable_fb_clear = vmx_fb_clear_ctrl_available; + + /* + * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS + * at VMEntry. Skip the MSR read/write when a guest has no use case to + * execute VERW. + */ + if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || + ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) + vmx->disable_fb_clear = false; +} + static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -4314,9 +4380,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; } - ret = kvm_set_msr_common(vcpu, msr_info); + ret = kvm_set_msr_common(vcpu, msr_info); } + /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ + if (msr_index == MSR_IA32_ARCH_CAPABILITIES) + vmx_update_fb_clear_dis(vcpu, vmx); + return ret; } @@ -4670,9 +4740,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - return 0; } @@ -5454,18 +5521,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; - if (var->unusable || !var->present) - ar = 1 << 16; - else { - ar = var->type & 15; - ar |= (var->s & 1) << 4; - ar |= (var->dpl & 3) << 5; - ar |= (var->present & 1) << 7; - ar |= (var->avl & 1) << 12; - ar |= (var->l & 1) << 13; - ar |= (var->db & 1) << 14; - ar |= (var->g & 1) << 15; - } + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + ar |= (var->unusable || !var->present) << 16; return ar; } @@ -6761,6 +6825,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vpid_sync_context(vmx->vpid); if (init_event) vmx_clear_hlt(vcpu); + + vmx_update_fb_clear_dis(vcpu, vmx); } /* @@ -7885,6 +7951,9 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || 
cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -10694,10 +10763,35 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } +u64 __always_inline vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx) +{ + u64 guestval, hostval = this_cpu_read(x86_spec_ctrl_current); + + if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) + return 0; + + guestval = __rdmsr(MSR_IA32_SPEC_CTRL); + + /* + * + * For legacy IBRS, the IBRS bit always needs to be written after + * transitioning from a less privileged predictor mode, regardless of + * whether the guest/host values differ. + */ + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || + guestval != hostval) + native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + + barrier_nospec(); + + return guestval; +} + static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4, evmcs_rsp; + u64 spec_ctrl; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -10773,6 +10867,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_l1d_flush(vcpu); else if (static_branch_unlikely(&mds_user_clear)) mds_clear_cpu_buffers(); + else if (static_branch_unlikely(&mmio_stale_data_clear) && + kvm_arch_has_assigned_device(vcpu->kvm)) + mds_clear_cpu_buffers(); + + vmx_disable_fb_clear(vmx); asm volatile ( /* Store host registers */ @@ -10919,6 +11018,26 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ); /* + * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before + * the first unbalanced RET after vmexit! + * + * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB + * entries and (in some cases) RSB underflow. + * + * eIBRS has its own protection against poisoned RSB, so it doesn't + * need the RSB filling sequence. But it does need to be enabled, and a + * single call to retire, before the first unbalanced RET. + * + * So no RETs before vmx_spec_ctrl_restore_host() below. + */ + vmexit_fill_RSB(); + + /* Save this for below */ + spec_ctrl = vmx_spec_ctrl_restore_host(vmx); + + vmx_enable_fb_clear(vmx); + + /* * We do not use IBRS in the kernel. If this vCPU has used the * SPEC_CTRL MSR it may have left it on; save the value and * turn it off. This is much more efficient than blindly adding @@ -10934,12 +11053,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * save it. 
*/ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); - - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); + vmx->spec_ctrl = spec_ctrl; /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) @@ -12498,6 +12612,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!nested_cpu_has_preemption_timer(vmcs12) && + nested_cpu_has_save_preemption_timer(vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -12634,7 +12752,7 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *exit_qual) { - bool ia32e; + bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); *exit_qual = ENTRY_FAIL_DEFAULT; @@ -12647,6 +12765,13 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 1; } + if ((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG) + return 1; + + if ((ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || + (ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) + return 1; + /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: @@ -12658,7 +12783,6 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, */ if (to_vmx(vcpu)->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { - ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || ((vmcs12->guest_cr0 & X86_CR0_PG) && @@ -13204,14 +13328,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, */ vmcs12_save_pending_event(vcpu, vmcs12); } - - /* - * Drop what we picked up for L2 via vmx_complete_interrupts. It is - * preserved above and would only end up incorrectly in L1. - */ - vcpu->arch.nmi_injected = false; - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); } /* @@ -13545,6 +13661,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); } + /* + * Drop events/exceptions that were queued for re-injection to L2 + * (picked up via vmx_complete_interrupts()), as well as exceptions + * that were pending for L2. Note, this must NOT be hoisted above + * prepare_vmcs12(), events/exceptions queued for re-injection need to + * be captured in vmcs12 (see vmcs12_save_pending_event()). + */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); @@ -13751,6 +13878,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ break; + case x86_intercept_pause: + /* + * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides + * with vanilla NOPs in the emulator. Apply the interception + * check only to actual PAUSE instructions. Don't check + * PAUSE-loop-exiting, software can't expect a given PAUSE to + * exit, i.e. 
KVM is within its rights to allow L2 to execute + * the PAUSE. + */ + if ((info->rep_prefix != REPE_PREFIX) || + !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + return X86EMUL_CONTINUE; + + break; + /* TODO: check more intercepts... */ default: break; @@ -14623,8 +14765,11 @@ static int __init vmx_init(void) } } + vmx_setup_fb_clear_ctrl(); + for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx/vmx_evmcs.h index 210a884090ad..210a884090ad 100644 --- a/arch/x86/kvm/vmx_evmcs.h +++ b/arch/x86/kvm/vmx/vmx_evmcs.h diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx/vmx_shadow_fields.h index cd0c75f6d037..cd0c75f6d037 100644 --- a/arch/x86/kvm/vmx_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmx_shadow_fields.h diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 417abc9ba1ad..0548ae57826f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1156,7 +1156,7 @@ static u32 msr_based_features[] = { MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_VMFUNC, - MSR_F10H_DECFG, + MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, }; @@ -1220,6 +1220,13 @@ u64 kvm_get_arch_capabilities(void) /* KVM does not emulate MSR_IA32_TSX_CTRL. */ data &= ~ARCH_CAP_TSX_CTRL_MSR; + + /* Guests don't need to know "Fill buffer clear control" exists */ + data &= ~ARCH_CAP_FB_CLEAR_CTRL; + + if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) + data |= ARCH_CAP_GDS_NO; + return data; } @@ -2453,6 +2460,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: break; @@ -2756,6 +2764,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: msr_info->data = 0; break; @@ -3633,12 +3642,11 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, { unsigned long val; + memset(dbgregs, 0, sizeof(*dbgregs)); memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); kvm_get_dr(vcpu, 6, &val); dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; - dbgregs->flags = 0; - memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -9732,9 +9740,9 @@ void kvm_arch_end_assignment(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); -bool kvm_arch_has_assigned_device(struct kvm *kvm) +bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) { - return atomic_read(&kvm->arch.assigned_device_count); + return arch_atomic_read(&kvm->arch.assigned_device_count); } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 614c2c6b1959..68ca883abfdb 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -43,8 +43,8 @@ static void delay_loop(unsigned long loops) " jnz 2b \n" "3: dec %0 \n" - : /* we don't need output */ - :"a" (loops) + : "+a" (loops) + : ); } diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index 33147fef3452..f1024b51cfee 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -22,6 +22,6 @@ */ ENTRY(__iowrite32_copy) movl %edx,%ecx - rep movsd + rep movsl ret ENDPROC(__iowrite32_copy) diff --git 
a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c index a018ec4fba53..c97be9a1430a 100644 --- a/arch/x86/lib/misc.c +++ b/arch/x86/lib/misc.c @@ -6,7 +6,7 @@ */ int num_digits(int val) { - int m = 10; + long long m = 10; int d = 1; if (val < 0) { diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 40dbbd8f1fe4..8c6d0fb72b3a 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -161,7 +161,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size) /* cache copy and flush to align dest */ if (!IS_ALIGNED(dest, 8)) { - unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest); + size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest); memcpy((void *) dest, (void *) source, len); clean_cache_range((void *) dest, len); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index b1dba0987565..2c84c5595cf4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -9,6 +9,7 @@ #include <linux/kmemleak.h> #include <asm/set_memory.h> +#include <asm/cpu_device_id.h> #include <asm/e820/api.h> #include <asm/init.h> #include <asm/page.h> @@ -207,6 +208,24 @@ static void __init probe_page_size_mask(void) } } +#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \ + .family = 6, \ + .model = _model, \ + } +/* + * INVLPG may not properly flush Global entries + * on these CPUs when PCIDs are enabled. + */ +static const struct x86_cpu_id invlpg_miss_ids[] = { + INTEL_MATCH(INTEL_FAM6_ALDERLAKE ), + INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ), + INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S), + {} +}; + static void setup_pcid(void) { if (!IS_ENABLED(CONFIG_X86_64)) @@ -215,6 +234,12 @@ static void setup_pcid(void) if (!boot_cpu_has(X86_FEATURE_PCID)) return; + if (x86_match_cpu(invlpg_miss_ids)) { + pr_info("Incomplete global flushes, disabling PCID"); + setup_clear_cpu_cap(X86_FEATURE_PCID); + return; + } + if (boot_cpu_has(X86_FEATURE_PGE)) { /* * This can't be cr4_set_bits_and_update_boot() -- the diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index adc77904fc3e..7da9b427044c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -171,9 +171,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; - phys_addr &= PHYSICAL_PAGE_MASK; + phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; + /* + * Mask out any bits not part of the actual physical + * address, like memory encryption bits. 
+ */ + phys_addr &= PHYSICAL_PAGE_MASK; + retval = reserve_memtype(phys_addr, (u64)phys_addr + size, pcm, &new_pcm); if (retval) { diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 650d5a6cafc7..832c899b7b73 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -563,7 +563,8 @@ void __init sme_enable(struct boot_params *bp) cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | ((u64)bp->ext_cmd_line_ptr << 32)); - cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0) + return; if (!strncmp(buffer, cmdline_on, sizeof(buffer))) sme_me_mask = me_mask; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fa150855647c..a830d49341ec 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -585,13 +585,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (start >= end) continue; - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - continue; - alloc_node_data(nid); } @@ -826,7 +819,7 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable) return; } mask = node_to_cpumask_map[node]; - if (!mask) { + if (!cpumask_available(mask)) { pr_err("node_to_cpumask_map[%i] NULL\n", node); dump_stack(); return; @@ -872,7 +865,7 @@ const struct cpumask *cpumask_of_node(int node) dump_stack(); return cpu_none_mask; } - if (node_to_cpumask_map[node] == NULL) { + if (!cpumask_available(node_to_cpumask_map[node])) { printk(KERN_WARNING "cpumask_of_node(%d): no node_to_cpumask_map!\n", node); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 324e26d0607b..b33304c0042a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -74,7 +74,7 @@ int pat_debug_enable; static int __init pat_debug_setup(char *str) { pat_debug_enable = 1; - return 0; + return 1; } __setup("debugpat", pat_debug_setup); diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 76959a7d88c8..94291e0ddcb7 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -7,6 +7,7 @@ #include <linux/dmi.h> #include <linux/pci.h> #include <linux/vgaarb.h> +#include <asm/amd_nb.h> #include <asm/hpet.h> #include <asm/pci_x86.h> @@ -824,3 +825,23 @@ static void rs690_fix_64bit_dma(struct pci_dev *pdev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma); #endif + +#ifdef CONFIG_AMD_NB + +#define AMD_15B8_RCC_DEV2_EPF0_STRAP2 0x10136008 +#define AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK 0x00000080L + +static void quirk_clear_strap_no_soft_reset_dev2_f0(struct pci_dev *dev) +{ + u32 data; + + if (!amd_smn_read(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, &data)) { + data &= ~AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK; + if (amd_smn_write(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, data)) + pci_err(dev, "Failed to write data 0x%x\n", data); + } else { + pci_err(dev, "Failed to read data\n"); + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b8, quirk_clear_strap_no_soft_reset_dev2_f0); +#endif diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 22da9bfd8a45..bacf8d988f65 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -441,6 +441,11 @@ void __init xen_msi_init(void) x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + /* + * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely + * controlled by the hypervisor. 
+ */ + pci_msi_ignore_mask = 1; } #endif diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 7fa8b3b53bc0..193860d7f2c4 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -85,7 +85,7 @@ static void send_ebook_state(void) return; } - if (!!test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == state) + if (test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == !!state) return; /* Nothing new to report. */ input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 3aa3149df07f..75cd943f03a7 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -26,6 +26,7 @@ #include <asm/cpu.h> #include <asm/mmu_context.h> #include <asm/cpu_device_id.h> +#include <asm/microcode.h> #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; @@ -41,7 +42,8 @@ static void msr_save_context(struct saved_context *ctxt) struct saved_msr *end = msr + ctxt->saved_msrs.num; while (msr < end) { - msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q); + if (msr->valid) + rdmsrl(msr->info.msr_no, msr->info.reg.q); msr++; } } @@ -267,6 +269,13 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); perf_restore_debug_store(); + + microcode_bsp_resume(); + + /* + * This needs to happen after the microcode has been updated upon resume + * because some of the MSRs are "emulated" in microcode. + */ msr_restore_context(ctxt); } @@ -426,8 +435,10 @@ static int msr_build_context(const u32 *msr_id, const int num) } for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) { + u64 dummy; + msr_array[i].info.msr_no = msr_id[j]; - msr_array[i].valid = false; + msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy); msr_array[i].info.reg.q = 0; } saved_msrs->num = total_num; @@ -514,10 +525,32 @@ static int pm_cpu_check(const struct x86_cpu_id *c) return ret; } +static void pm_save_spec_msr(void) +{ + struct msr_enumeration { + u32 msr_no; + u32 feature; + } msr_enum[] = { + { MSR_IA32_SPEC_CTRL, X86_FEATURE_MSR_SPEC_CTRL }, + { MSR_IA32_TSX_CTRL, X86_FEATURE_MSR_TSX_CTRL }, + { MSR_TSX_FORCE_ABORT, X86_FEATURE_TSX_FORCE_ABORT }, + { MSR_IA32_MCU_OPT_CTRL, X86_FEATURE_SRBDS_CTRL }, + { MSR_AMD64_LS_CFG, X86_FEATURE_LS_CFG_SSBD }, + { MSR_AMD64_DE_CFG, X86_FEATURE_LFENCE_RDTSC }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(msr_enum); i++) { + if (boot_cpu_has(msr_enum[i].feature)) + msr_build_context(&msr_enum[i].msr_no, 1); + } +} + static int pm_check_save_msr(void) { dmi_check_system(msr_save_dmi_table); pm_cpu_check(msr_save_cpu_table); + pm_save_spec_msr(); return 0; } diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 2cfa0caef133..00f104e341e5 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -12,6 +12,11 @@ $(obj)/string.o: $(srctree)/arch/x86/boot/compressed/string.c FORCE $(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE $(call if_changed_rule,cc_o_c) +# When profile-guided optimization is enabled, llvm emits two different +# overlapping text sections, which is not supported by kexec. Remove profile +# optimization flags. 
+KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%,$(KBUILD_CFLAGS)) + LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro @@ -25,7 +30,7 @@ KCOV_INSTRUMENT := n # make up the standalone purgatory.ro PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel -PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss +PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss -g0 PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That @@ -56,6 +61,9 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_string.o += $(PURGATORY_CFLAGS) +AFLAGS_REMOVE_setup-x86_$(BITS).o += -g -Wa,-gdwarf-2 +AFLAGS_REMOVE_entry64.o += -g -Wa,-gdwarf-2 + $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 3ee234b6234d..255a44dd415a 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -23,9 +23,11 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func, { long res; void *stub_addr; + + BUILD_BUG_ON(sizeof(*desc) % sizeof(long)); + res = syscall_stub_data(mm_idp, (unsigned long *)desc, - (sizeof(*desc) + sizeof(long) - 1) & - ~(sizeof(long) - 1), + sizeof(*desc) / sizeof(long), addr, &stub_addr); if (!res) { unsigned long args[] = { func, diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h index 68fd2cf526fd..f6e9f84397e7 100644 --- a/arch/x86/um/shared/sysdep/syscalls_32.h +++ b/arch/x86/um/shared/sysdep/syscalls_32.h @@ -6,10 +6,9 @@ #include <asm/unistd.h> #include <sysdep/ptrace.h> -typedef long syscall_handler_t(struct pt_regs); +typedef long syscall_handler_t(struct syscall_args); extern syscall_handler_t *sys_call_table[]; #define EXECUTE_SYSCALL(syscall, regs) \ - ((long (*)(struct syscall_args)) \ - (*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) + ((*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h index 8a7d5e1da98e..1e6875b4ffd8 100644 --- a/arch/x86/um/shared/sysdep/syscalls_64.h +++ b/arch/x86/um/shared/sysdep/syscalls_64.h @@ -10,13 +10,12 @@ #include <linux/msg.h> #include <linux/shm.h> -typedef long syscall_handler_t(void); +typedef long syscall_handler_t(long, long, long, long, long, long); extern syscall_handler_t *sys_call_table[]; #define EXECUTE_SYSCALL(syscall, regs) \ - (((long (*)(long, long, long, long, long, long)) \ - (*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(®s->regs), \ + (((*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(®s->regs), \ UPT_SYSCALL_ARG2(®s->regs), \ UPT_SYSCALL_ARG3(®s->regs), \ UPT_SYSCALL_ARG4(®s->regs), \ diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index 58f51667e2e4..8249685b4096 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -11,6 +11,7 @@ #include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <os.h> +#include <registers.h> long arch_prctl(struct task_struct *task, int option, unsigned long __user *arg2) @@ -35,7 +36,7 @@ long arch_prctl(struct task_struct *task, int option, switch (option) { case ARCH_SET_FS: case ARCH_SET_GS: - ret = restore_registers(pid, ¤t->thread.regs.regs); + ret = restore_pid_registers(pid, ¤t->thread.regs.regs); if (ret) return ret; break; diff --git a/arch/x86/um/tls_32.c 
b/arch/x86/um/tls_32.c index 5bd949da7a4a..b69ab2409430 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -65,9 +65,6 @@ static int get_free_idx(struct task_struct* task) struct thread_struct *t = &task->thread; int idx; - if (!t->arch.tls_array) - return GDT_ENTRY_TLS_MIN; - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) if (!t->arch.tls_array[idx].present) return idx + GDT_ENTRY_TLS_MIN; @@ -242,9 +239,6 @@ static int get_tls_entry(struct task_struct *task, struct user_desc *info, { struct thread_struct *t = &task->thread; - if (!t->arch.tls_array) - goto clear; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return -EINVAL; diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c index 7c441b59d375..be99ff25c503 100644 --- a/arch/x86/um/vdso/um_vdso.c +++ b/arch/x86/um/vdso/um_vdso.c @@ -20,8 +20,10 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + asm("syscall" + : "=a" (ret) + : "0" (__NR_clock_gettime), "D" (clock), "S" (ts) + : "rcx", "r11", "memory"); return ret; } @@ -32,8 +34,10 @@ int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + asm("syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday), "D" (tv), "S" (tz) + : "rcx", "r11", "memory"); return ret; } diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 95997e6c0696..9813298ba57d 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -505,10 +505,7 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) return ret; } -bool is_xen_pmu(int cpu) -{ - return (get_xenpmu_data() != NULL); -} +bool is_xen_pmu; void xen_pmu_init(int cpu) { @@ -519,7 +516,7 @@ void xen_pmu_init(int cpu) BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE); - if (xen_hvm_domain()) + if (xen_hvm_domain() || (cpu != 0 && !is_xen_pmu)) return; xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL); @@ -540,7 +537,8 @@ void xen_pmu_init(int cpu) per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data; per_cpu(xenpmu_shared, cpu).flags = 0; - if (cpu == 0) { + if (!is_xen_pmu) { + is_xen_pmu = true; perf_register_guest_info_callbacks(&xen_guest_cbs); xen_pmu_arch_init(); } diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h index 0e83a160589b..65c58894fc79 100644 --- a/arch/x86/xen/pmu.h +++ b/arch/x86/xen/pmu.h @@ -4,6 +4,8 @@ #include <xen/interface/xenpmu.h> +extern bool is_xen_pmu; + irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); #ifdef CONFIG_XEN_HAVE_VPMU void xen_pmu_init(int cpu); @@ -12,7 +14,6 @@ void xen_pmu_finish(int cpu); static inline void xen_pmu_init(int cpu) {} static inline void xen_pmu_finish(int cpu) {} #endif -bool is_xen_pmu(int cpu); bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); int pmu_apic_update(uint32_t reg); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7a43b2ae19f1..a1cc855c539c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -32,30 +32,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) void xen_smp_intr_free(unsigned int cpu) { + kfree(per_cpu(xen_resched_irq, cpu).name); + per_cpu(xen_resched_irq, cpu).name = NULL; if (per_cpu(xen_resched_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); per_cpu(xen_resched_irq, cpu).irq = -1; - 
kfree(per_cpu(xen_resched_irq, cpu).name); - per_cpu(xen_resched_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfunc_irq, cpu).name); + per_cpu(xen_callfunc_irq, cpu).name = NULL; if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL); per_cpu(xen_callfunc_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfunc_irq, cpu).name); - per_cpu(xen_callfunc_irq, cpu).name = NULL; } + kfree(per_cpu(xen_debug_irq, cpu).name); + per_cpu(xen_debug_irq, cpu).name = NULL; if (per_cpu(xen_debug_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL); per_cpu(xen_debug_irq, cpu).irq = -1; - kfree(per_cpu(xen_debug_irq, cpu).name); - per_cpu(xen_debug_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); + per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq, NULL); per_cpu(xen_callfuncsingle_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); - per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; } } @@ -65,6 +65,7 @@ int xen_smp_intr_init(unsigned int cpu) char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + per_cpu(xen_resched_irq, cpu).name = resched_name; rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, xen_reschedule_interrupt, @@ -74,9 +75,9 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_resched_irq, cpu).irq = rc; - per_cpu(xen_resched_irq, cpu).name = resched_name; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, xen_call_function_interrupt, @@ -86,18 +87,21 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfunc_irq, cpu).irq = rc; - per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; - debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); - rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, - IRQF_PERCPU | IRQF_NOBALANCING, - debug_name, NULL); - if (rc < 0) - goto fail; - per_cpu(xen_debug_irq, cpu).irq = rc; - per_cpu(xen_debug_irq, cpu).name = debug_name; + if (!xen_fifo_events) { + debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + per_cpu(xen_debug_irq, cpu).name = debug_name; + rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, + xen_debug_interrupt, + IRQF_PERCPU | IRQF_NOBALANCING, + debug_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_debug_irq, cpu).irq = rc; + } callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, cpu, xen_call_function_single_interrupt, @@ -107,7 +111,6 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; - per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; return 0; diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index f8d39440b292..e5bd9eb42191 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -19,6 +19,12 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void) xen_vcpu_setup(0); /* + * Called again in case the kernel boots on vcpu >= MAX_VIRT_CPUS. + * Refer to comments in xen_hvm_init_time_ops(). + */ + xen_hvm_init_time_ops(); + + /* * The alternative logic (which patches the unlock/lock) runs before * the smp bootup up code is activated. 
Hence we need to set this up * the core kernel is being patched. Otherwise we will have only diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 41fd4c123165..66f83562d329 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -27,6 +27,7 @@ #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/cpu.h> +#include <asm/fpu/internal.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> @@ -58,6 +59,7 @@ static void cpu_bringup(void) int cpu; cpu_init(); + fpu__init_cpu(); touch_softlockup_watchdog(); preempt_disable(); @@ -94,18 +96,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void) void xen_smp_intr_free_pv(unsigned int cpu) { + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; if (per_cpu(xen_irq_work, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); per_cpu(xen_irq_work, cpu).irq = -1; - kfree(per_cpu(xen_irq_work, cpu).name); - per_cpu(xen_irq_work, cpu).name = NULL; } + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); per_cpu(xen_pmu_irq, cpu).irq = -1; - kfree(per_cpu(xen_pmu_irq, cpu).name); - per_cpu(xen_pmu_irq, cpu).name = NULL; } } @@ -115,6 +117,7 @@ int xen_smp_intr_init_pv(unsigned int cpu) char *callfunc_name, *pmu_name; callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + per_cpu(xen_irq_work, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, cpu, xen_irq_work_interrupt, @@ -124,10 +127,10 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_irq_work, cpu).irq = rc; - per_cpu(xen_irq_work, cpu).name = callfunc_name; - if (is_xen_pmu(cpu)) { + if (is_xen_pmu) { pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + per_cpu(xen_pmu_irq, cpu).name = pmu_name; rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, xen_pmu_irq_handler, IRQF_PERCPU|IRQF_NOBALANCING, @@ -135,7 +138,6 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_pmu_irq, cpu).irq = rc; - per_cpu(xen_pmu_irq, cpu).name = pmu_name; } return 0; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 6fffb86a32ad..e6cf1b430fd0 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -83,6 +83,7 @@ void xen_init_lock_cpu(int cpu) cpu, per_cpu(lock_kicker_irq, cpu)); name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + per_cpu(irq_name, cpu) = name; irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, dummy_handler, @@ -93,7 +94,6 @@ void xen_init_lock_cpu(int cpu) if (irq >= 0) { disable_irq(irq); /* make sure it's never delivered */ per_cpu(lock_kicker_irq, cpu) = irq; - per_cpu(irq_name, cpu) = name; } printk("cpu %d spinlock event irq %d\n", cpu, irq); @@ -106,6 +106,8 @@ void xen_uninit_lock_cpu(int cpu) if (!xen_pvspin) return; + kfree(per_cpu(irq_name, cpu)); + per_cpu(irq_name, cpu) = NULL; /* * When booting the kernel with 'mitigations=auto,nosmt', the secondary * CPUs are not activated, and lock_kicker_irq is not initialized. 
@@ -116,8 +118,6 @@ void xen_uninit_lock_cpu(int cpu) unbind_from_irqhandler(irq, NULL); per_cpu(lock_kicker_irq, cpu) = -1; - kfree(per_cpu(irq_name, cpu)); - per_cpu(irq_name, cpu) = NULL; } PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 01dcccf9185f..9809de9f2310 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -547,6 +547,11 @@ static void xen_hvm_setup_cpu_clockevents(void) void __init xen_hvm_init_time_ops(void) { + static bool hvm_time_initialized; + + if (hvm_time_initialized) + return; + /* * vector callback is needed otherwise we cannot receive interrupts * on cpu > 0 and at this point we don't know how many cpus are @@ -556,7 +561,22 @@ void __init xen_hvm_init_time_ops(void) return; if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { - pr_info("Xen doesn't support pvclock on HVM, disable pv timer"); + pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer"); + return; + } + + /* + * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'. + * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest + * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access + * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic. + * + * The xen_hvm_init_time_ops() should be called again later after + * __this_cpu_read(xen_vcpu) is available. + */ + if (!__this_cpu_read(xen_vcpu)) { + pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n", + xen_vcpu_nr(0)); return; } @@ -568,5 +588,7 @@ void __init xen_hvm_init_time_ops(void) x86_platform.calibrate_tsc = xen_tsc_khz; x86_platform.get_wallclock = xen_get_wallclock; x86_platform.set_wallclock = xen_set_wallclock; + + hvm_time_initialized = true; } #endif diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2f111f47ba98..9faec8543237 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,6 +30,8 @@ extern struct start_info *xen_start_info; extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; +extern bool xen_fifo_events; + void xen_setup_mfn_list_list(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); |
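
Editor's note on the one-line type change in arch/x86/lib/misc.c above: widening "m" to long long keeps the digit-counting loop from overflowing a 32-bit int once m would pass 10^9 while val is near INT_MAX. The sketch below is an illustrative, standalone userspace reconstruction, not the kernel file verbatim; the function bodies and the main() harness are assumptions added for demonstration, and only the int -> long long widening of "m" is taken from the patch itself.

#include <limits.h>
#include <stdio.h>

/*
 * Overflow-prone variant: mirrors the shape of the old code, where "m"
 * is an int. For val close to INT_MAX the loop needs m to reach 10^10,
 * so "m *= 10" overflows a 32-bit int, which is undefined behaviour.
 */
static int num_digits_narrow(int val)
{
	int m = 10;
	int d = 1;

	if (val < 0) {
		d++;
		val = -val;
	}
	while (val >= m) {
		m *= 10;	/* signed overflow once m == 10^9 and val > 10^9 */
		d++;
	}
	return d;
}

/*
 * Widened variant, matching the fix: "m" is long long, so it can hold
 * 10^10 and the loop terminates with the correct digit count.
 */
static int num_digits_wide(int val)
{
	long long m = 10;
	int d = 1;

	if (val < 0) {
		d++;
		val = -val;
	}
	while (val >= m) {
		m *= 10;
		d++;
	}
	return d;
}

int main(void)
{
	/* 2147483647 has 10 digits; the widened helper reports that correctly. */
	printf("num_digits(INT_MAX)     = %d\n", num_digits_wide(INT_MAX));
	/* 11 for a large negative value: 10 digits plus the '-' sign. */
	printf("num_digits(INT_MIN + 1) = %d\n", num_digits_wide(INT_MIN + 1));
	(void)num_digits_narrow;	/* kept only to contrast with the old code */
	return 0;
}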