diff options
Diffstat (limited to 'arch/x86')
50 files changed, 524 insertions, 241 deletions
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 1c060748c813..f38ffcc610d2 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -87,7 +87,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 01d628ea3402..c6c4b877f3d2 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -49,16 +49,17 @@ * Position Independent Executable (PIE) so that linker won't optimize * R_386_GOT32X relocation to its fixed symbol address. Older * linkers generate R_386_32 relocations against locally defined symbols, - * _bss, _ebss, _got and _egot, in PIE. It isn't wrong, just less + * _bss, _ebss, _got, _egot and _end, in PIE. It isn't wrong, just less * optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle * R_386_32 relocations when relocating the kernel. To generate - * R_386_RELATIVE relocations, we mark _bss, _ebss, _got and _egot as + * R_386_RELATIVE relocations, we mark _bss, _ebss, _got, _egot and _end as * hidden: */ .hidden _bss .hidden _ebss .hidden _got .hidden _egot + .hidden _end __HEAD ENTRY(startup_32) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index a25127916e67..7ab1c6bcc66a 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -41,6 +41,7 @@ .hidden _ebss .hidden _got .hidden _egot + .hidden _end __HEAD .code32 diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 0eb9f92f3717..136972a4c454 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -217,7 +217,6 @@ CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y # CONFIG_LCD_CLASS_DEVICE is not set -CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set @@ -247,6 +246,7 @@ CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index e32fc1f274d8..d387193ef7fc 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -213,7 +213,6 @@ CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y # CONFIG_LCD_CLASS_DEVICE is not set -CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set @@ -243,6 +242,7 @@ CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index 5f6a5af9c489..77043a82da51 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -127,10 +127,6 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ -.macro setddq n - var_ddq_add = ddq_add_\n -.endm - /* generate a unique variable for xmm register */ .macro setxdata n var_xdata = %xmm\n @@ -140,9 +136,7 @@ ddq_add_8: .macro club name, id .altmacro - .if \name == DDQ_DATA - setddq %\id - .elseif \name == XDATA + .if \name == XDATA setxdata %\id .endif .noaltmacro @@ -165,9 +159,8 @@ ddq_add_8: .set i, 1 .rept (by - 1) - club DDQ_DATA, i club XDATA, i - vpaddq var_ddq_add(%rip), xcounter, var_xdata + vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata vptest ddq_low_msk(%rip), var_xdata jnz 1f vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata @@ -180,8 +173,7 @@ ddq_add_8: vmovdqa 1*16(p_keys), xkeyA vpxor xkey0, xdata0, xdata0 - club DDQ_DATA, by - vpaddq var_ddq_add(%rip), xcounter, xcounter + vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter vptest ddq_low_msk(%rip), xcounter jnz 1f vpaddq ddq_high_add_1(%rip), xcounter, xcounter diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index baeba0567126..884466592943 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -9,6 +9,33 @@ #include <linux/mod_devicetable.h> +#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) + +/** + * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@_vendor + * @_family: The family number or X86_FAMILY_ANY + * @_model: The model number, model constant or X86_MODEL_ANY + * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY + * @_data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * Backport version to keep the SRBDS pile consistant. No shorter variants + * required for this. + */ +#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ + .vendor = X86_VENDOR_##_vendor, \ + .family = _family, \ + .model = _model, \ + .steppings = _steppings, \ + .feature = _feature, \ + .driver_data = (unsigned long) _data \ +} + extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); #endif diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 157cfaf1064c..e08866cd2287 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -291,6 +291,7 @@ #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ @@ -346,6 +347,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ @@ -390,5 +392,6 @@ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 00f7cf45e699..8e95aa4b0d17 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -74,7 +74,7 @@ #define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ -#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT)) #ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9529fe69e1d9..ecb6009a2c8a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1060,7 +1060,7 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); - int (*write_log_dirty)(struct kvm_vcpu *vcpu); + int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5761a86b88e0..9dc445ac7f19 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -110,6 +110,10 @@ #define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ #define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ +/* SRBDS support */ +#define MSR_IA32_MCU_OPT_CTRL 0x00000123 +#define RNGDS_MITG_DIS BIT(0) + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index b73a16a56e4f..270448b178a7 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -232,6 +232,7 @@ enum spectre_v2_mitigation { enum spectre_v2_user_mitigation { SPECTRE_V2_USER_NONE, SPECTRE_V2_USER_STRICT, + SPECTRE_V2_USER_STRICT_PREFERRED, SPECTRE_V2_USER_PRCTL, SPECTRE_V2_USER_SECCOMP, }; @@ -329,7 +330,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); * combination with microcode which triggers a CPU buffer flush when the * instruction is executed. */ -static inline void mds_clear_cpu_buffers(void) +static __always_inline void mds_clear_cpu_buffers(void) { static const u16 ds = __KERNEL_DS; @@ -350,7 +351,7 @@ static inline void mds_clear_cpu_buffers(void) * * Clear CPU buffers if the corresponding static key is enabled */ -static inline void mds_user_clear_cpu_buffers(void) +static __always_inline void mds_user_clear_cpu_buffers(void) { if (static_branch_likely(&mds_user_clear)) mds_clear_cpu_buffers(); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 98a337e3835d..2f7a4018b6e4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -234,6 +234,7 @@ static inline int pmd_large(pmd_t pte) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 851c04b7a092..1572a436bc08 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -4,6 +4,11 @@ #define ARCH_DEFAULT_PKEY 0 +/* + * If more than 16 keys are ever supported, a thorough audit + * will be necessary to ensure that the types that store key + * numbers and masks have sufficient capacity. + */ #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a87eda9691e..56a89519dc14 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -344,7 +344,7 @@ struct x86_hw_tss { #define INVALID_IO_BITMAP_OFFSET 0x8000 struct entry_stack { - unsigned long words[64]; + char stack[PAGE_SIZE]; }; struct entry_stack_page { diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 971830341061..82b0ff6cac97 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -711,7 +711,17 @@ extern struct movsl_mask { * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ -#define user_access_begin() __uaccess_begin() +static __must_check inline bool user_access_begin(int type, + const void __user *ptr, + size_t len) +{ + if (unlikely(!access_ok(type, ptr, len))) + return 0; + __uaccess_begin_nospec(); + return 1; +} + +#define user_access_begin(a, b, c) user_access_begin(a, b, c) #define user_access_end() __uaccess_end() #define unsafe_put_user(x, ptr, err_label) \ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 48ab5fdd1044..ee33f0951322 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1915,7 +1915,7 @@ void __init init_apic_mappings(void) unsigned int new_apicid; if (apic_validate_deadline_timer()) - pr_debug("TSC deadline timer available\n"); + pr_info("TSC deadline timer available\n"); if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2271adbc3c42..be226cdd08d3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2160,6 +2160,7 @@ static inline void __init check_timer(void) legacy_pic->init(0); legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); + legacy_pic->unmask(0); unlock_ExtINT_logic(); @@ -2233,12 +2234,12 @@ static int mp_irqdomain_create(int ioapic) ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); - /* Release fw handle if it was allocated above */ - if (!cfg->dev) - irq_domain_free_fwnode(fn); - - if (!ip->irqdomain) + if (!ip->irqdomain) { + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); return -ENOMEM; + } ip->irqdomain->parent = parent; @@ -2252,8 +2253,13 @@ static int mp_irqdomain_create(int ioapic) static void ioapic_destroy_irqdomain(int idx) { + struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + if (ioapics[idx].irqdomain) { irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); ioapics[idx].irqdomain = NULL; } } diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index f10e7f93b0e2..8c102d62b859 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -149,10 +149,11 @@ void __init arch_init_msi_domain(struct irq_domain *parent) msi_default_domain = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, parent); - irq_domain_free_fwnode(fn); } - if (!msi_default_domain) + if (!msi_default_domain) { + irq_domain_free_fwnode(fn); pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); + } } #ifdef CONFIG_IRQ_REMAP @@ -185,7 +186,8 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, if (!fn) return NULL; d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) + irq_domain_free_fwnode(fn); return d; } #endif @@ -248,7 +250,8 @@ static struct irq_domain *dmar_get_irq_domain(void) if (fn) { dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, x86_vector_domain); - irq_domain_free_fwnode(fn); + if (!dmar_domain) + irq_domain_free_fwnode(fn); } out: mutex_unlock(&dmar_lock); @@ -373,7 +376,10 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) } d = msi_create_irq_domain(fn, domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } return d; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b958082c74a7..637cf4dfccc9 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -368,6 +368,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; + + /* Don't invoke affinity setter on deactivated interrupts */ + irqd_set_affinity_on_activate(irq_data); + err = assign_irq_vector_policy(virq + i, node, data, info, irq_data); if (err) { @@ -457,7 +461,6 @@ int __init arch_early_irq_init(void) x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); - irq_domain_free_fwnode(fn); irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7896a34f53b5..245184152892 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); +static void __init srbds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -60,7 +61,7 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; u64 __ro_after_init x86_amd_ls_cfg_base; u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; -/* Control conditional STIPB in switch_to() */ +/* Control conditional STIBP in switch_to() */ DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); /* Control conditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); @@ -108,6 +109,7 @@ void __init check_bugs(void) l1tf_select_mitigation(); mds_select_mitigation(); taa_select_mitigation(); + srbds_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -391,6 +393,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting + * TSX that are only exposed to SRBDS when TSX is enabled. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -488,7 +581,9 @@ early_param("nospectre_v1", nospectre_v1_cmdline); static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; -static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init = +static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = + SPECTRE_V2_USER_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; #ifdef CONFIG_RETPOLINE @@ -540,10 +635,11 @@ enum spectre_v2_user_cmd { }; static const char * const spectre_v2_user_strings[] = { - [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", - [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", - [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", - [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", + [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", + [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", + [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection", + [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", + [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; static const struct { @@ -655,23 +751,36 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", static_key_enabled(&switch_mm_always_ibpb) ? "always-on" : "conditional"); + + spectre_v2_user_ibpb = mode; } - /* If enhanced IBRS is enabled no STIPB required */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + /* + * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * required. + */ + if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; /* - * If SMT is not possible or STIBP is not available clear the STIPB - * mode. + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. */ - if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) + if (mode != SPECTRE_V2_USER_STRICT && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + + /* + * If STIBP is not available, clear the STIBP mode. + */ + if (!boot_cpu_has(X86_FEATURE_STIBP)) mode = SPECTRE_V2_USER_NONE; + + spectre_v2_user_stibp = mode; + set_mode: - spectre_v2_user = mode; - /* Only print the STIBP mode when SMT possible */ - if (smt_possible) - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("%s\n", spectre_v2_user_strings[mode]); } static const char * const spectre_v2_strings[] = { @@ -902,10 +1011,11 @@ void arch_smt_update(void) { mutex_lock(&spec_ctrl_mutex); - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: update_stibp_strict(); break; case SPECTRE_V2_USER_PRCTL: @@ -1134,13 +1244,19 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { case PR_SPEC_ENABLE: - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; /* * Indirect branch speculation is always disabled in strict - * mode. + * mode. It can neither be enabled if it was force-disabled + * by a previous prctl call. + */ - if (spectre_v2_user == SPECTRE_V2_USER_STRICT) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + task_spec_ib_force_disable(task)) return -EPERM; task_clear_spec_ib_disable(task); task_update_spec_tif(task); @@ -1151,9 +1267,12 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) * Indirect branch speculation is always allowed when * mitigation is force disabled. */ - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; - if (spectre_v2_user == SPECTRE_V2_USER_STRICT) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) return 0; task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) @@ -1184,7 +1303,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) { if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); - if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); } #endif @@ -1213,21 +1333,24 @@ static int ib_prctl_get(struct task_struct *task) if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return PR_SPEC_NOT_AFFECTED; - switch (spectre_v2_user) { - case SPECTRE_V2_USER_NONE: + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: + else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + return PR_SPEC_DISABLE; + else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case SPECTRE_V2_USER_STRICT: - return PR_SPEC_DISABLE; - default: + } else return PR_SPEC_NOT_AFFECTED; - } } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) @@ -1466,11 +1589,13 @@ static char *stibp_state(void) if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return ""; - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: return ", STIBP: disabled"; case SPECTRE_V2_USER_STRICT: return ", STIBP: forced"; + case SPECTRE_V2_USER_STRICT_PREFERRED: + return ", STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) @@ -1491,6 +1616,11 @@ static char *ibpb_state(void) return ""; } +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1532,6 +1662,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); + case X86_BUG_SRBDS: + return srbds_show_state(buf); + default: break; } @@ -1578,4 +1711,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7b4141889919..64066a2497e4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -854,6 +854,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) c->x86_phys_bits = 36; #endif + c->x86_cache_bits = c->x86_phys_bits; if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); @@ -894,7 +895,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) } } #endif - c->x86_cache_bits = c->x86_phys_bits; } #define NO_SPECULATION BIT(0) @@ -964,9 +964,30 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; -static bool __init cpu_matches(unsigned long which) +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define SRBDS BIT(0) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_CORE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_ULT, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_GT3E, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_GT3E, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_CORE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPINGS(0x0, 0xC), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPINGS(0x0, 0xD), SRBDS), + {} +}; + +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { - const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + const struct x86_cpu_id *m = x86_match_cpu(table); return m && !!(m->driver_data & which); } @@ -986,29 +1007,32 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) u64 ia32_cap = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ - if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); - if (cpu_matches(NO_SPECULATION)) + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); - if (cpu_matches(MSBDS_ONLY)) + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); } - if (!cpu_matches(NO_SWAPGS)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); /* @@ -1026,7 +1050,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); - if (cpu_matches(NO_MELTDOWN)) + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. + */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -1035,7 +1068,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (cpu_matches(NO_L1TF)) + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); @@ -1451,6 +1484,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); + update_srbds_msr(); } static __init int setup_noclflush(char *arg) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index db10a63687d3..432058e5e44b 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -69,6 +69,7 @@ extern int detect_ht_early(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); extern u64 x86_read_arch_cap_msr(void); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 3fed38812eea..751e59057466 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -34,13 +34,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index e57b59762f9f..94aa91b09c28 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -518,7 +518,7 @@ static void do_inject(void) */ if (inj_type == DFR_INT_INJ) { i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + i_mce.status &= ~MCI_STATUS_UC; } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 87a57b7642d3..601a5da1d196 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -907,8 +907,6 @@ const void *get_xsave_field_ptr(int xsave_state) #ifdef CONFIG_ARCH_HAS_PKEYS -#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) -#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. @@ -927,6 +925,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. + */ + WARN_ON_ONCE(pkey >= arch_max_pkey()); + /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; @@ -964,18 +969,31 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) return true; } -/* - * This is similar to user_regset_copyout(), but will not add offset to - * the source data pointer or increment pos, count, kbuf, and ubuf. - */ -static inline void -__copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int offset, unsigned int size, unsigned int size_total) +static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count) { - if (offset < size_total) { - unsigned int copy = min(size, size_total - offset); + if (*pos < to) { + unsigned size = to - *pos; + + if (size > *count) + size = *count; + memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size); + *kbuf += size; + *pos += size; + *count -= size; + } +} - memcpy(kbuf + offset, data, copy); +static void copy_part(unsigned offset, unsigned size, void *from, + void **kbuf, unsigned *pos, unsigned *count) +{ + fill_gap(offset, kbuf, pos, count); + if (size > *count) + size = *count; + if (size) { + memcpy(*kbuf, from, size); + *kbuf += size; + *pos += size; + *count -= size; } } @@ -988,8 +1006,9 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, */ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { - unsigned int offset, size; struct xstate_header header; + const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr); + unsigned count = size_total; int i; /* @@ -1005,46 +1024,42 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of header.xfeatures = xsave->header.xfeatures; header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + if (header.xfeatures & XFEATURE_MASK_FP) + copy_part(0, off_mxcsr, + &xsave->i387, &kbuf, &offset_start, &count); + if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)) + copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE, + &xsave->i387.mxcsr, &kbuf, &offset_start, &count); + if (header.xfeatures & XFEATURE_MASK_FP) + copy_part(offsetof(struct fxregs_state, st_space), 128, + &xsave->i387.st_space, &kbuf, &offset_start, &count); + if (header.xfeatures & XFEATURE_MASK_SSE) + copy_part(xstate_offsets[XFEATURE_SSE], 256, + &xsave->i387.xmm_space, &kbuf, &offset_start, &count); + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + copy_part(offsetof(struct fxregs_state, sw_reserved), 48, + xstate_fx_sw_bytes, &kbuf, &offset_start, &count); /* * Copy xregs_state->header: */ - offset = offsetof(struct xregs_state, header); - size = sizeof(header); - - __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); + copy_part(offsetof(struct xregs_state, header), sizeof(header), + &header, &kbuf, &offset_start, &count); - for (i = 0; i < XFEATURE_MAX; i++) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { /* * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { void *src = __raw_xsave_addr(xsave, 1 << i); - offset = xstate_offsets[i]; - size = xstate_sizes[i]; - - /* The next component has to fit fully into the output buffer: */ - if (offset + size > size_total) - break; - - __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); + copy_part(xstate_offsets[i], xstate_sizes[i], + src, &kbuf, &offset_start, &count); } } - - if (xfeatures_mxcsr_quirk(header.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); - } - - /* - * Fill xsave->i387.sw_reserved value for ptrace frame: - */ - offset = offsetof(struct fxregs_state, sw_reserved); - size = sizeof(xstate_fx_sw_bytes); - - __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); + fill_gap(size_total, &kbuf, &offset_start, &count); return 0; } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 02abc134367f..f7833ae4e3f1 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -206,7 +206,7 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG + printk_deferred(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 9d7bb8de2917..02665ffef050 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -744,16 +744,11 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); -static struct kprobe kretprobe_kprobe = { - .addr = (void *)kretprobe_trampoline, -}; - /* * Called from kretprobe_trampoline */ __visible __used void *trampoline_handler(struct pt_regs *regs) { - struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; struct hlist_node *tmp; @@ -763,16 +758,12 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) void *frame_pointer; bool skipped = false; - preempt_disable(); - /* * Set a dummy kprobe for avoiding kretprobe recursion. * Since kretprobe never run in kprobe handler, kprobe must not * be running at this point. */ - kcb = get_kprobe_ctlblk(); - __this_cpu_write(current_kprobe, &kretprobe_kprobe); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; + kprobe_busy_begin(); INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); @@ -851,7 +842,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) __this_cpu_write(current_kprobe, &ri->rp->kp); ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, &kretprobe_kprobe); + __this_cpu_write(current_kprobe, &kprobe_busy); } recycle_rp_inst(ri, &empty_rp); @@ -867,8 +858,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); - __this_cpu_write(current_kprobe, NULL); - preempt_enable(); + kprobe_busy_end(); hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d2ef967bfafb..a07b09f68e7e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -414,28 +414,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, u64 msr = x86_spec_ctrl_base; bool updmsr = false; - /* - * If TIF_SSBD is different, select the proper mitigation - * method. Note that if SSBD mitigation is disabled or permanentely - * enabled this branch can't be taken because nothing can set - * TIF_SSBD. - */ - if (tif_diff & _TIF_SSBD) { - if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + /* Handle change of TIF_SSBD depending on the mitigation method. */ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_ssb_virt_state(tifn); - } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_core_ssb_state(tifn); - } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - msr |= ssbd_tif_to_spec_ctrl(tifn); - updmsr = true; - } + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + updmsr |= !!(tif_diff & _TIF_SSBD); + msr |= ssbd_tif_to_spec_ctrl(tifn); } - /* - * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, - * otherwise avoid the MSR write. - */ + /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ if (IS_ENABLED(CONFIG_SMP) && static_branch_unlikely(&switch_to_cond_stibp)) { updmsr |= !!(tif_diff & _TIF_SPEC_IB); diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 898e97cf6629..320ab978fb1f 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -19,7 +19,7 @@ static inline void switch_to_extra(struct task_struct *prev, if (IS_ENABLED(CONFIG_SMP)) { /* * Avoid __switch_to_xtra() invocation when conditional - * STIPB is disabled and the only different bit is + * STIBP is disabled and the only different bit is * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not * in the TIF_WORK_CTXSW masks. */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 734549492a18..dc4d27000aa3 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -374,7 +374,7 @@ static unsigned long task_seg_base(struct task_struct *task, */ mutex_lock(&task->mm->context.lock); ldt = task->mm->context.ldt; - if (unlikely(idx >= ldt->nr_entries)) + if (unlikely(!ldt || idx >= ldt->nr_entries)) base = 0; else base = get_desc_base(ldt->entries + idx); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index c663d5fcff2e..b7663a1f89ee 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -197,6 +197,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, + { /* Handle problems with rebooting on Apple MacBook6,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook6,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"), + }, + }, { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, .ident = "Apple MacBookPro5", diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index ab0176ae985b..12f90f17f4f6 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -24,10 +24,6 @@ #include <asm/hpet.h> #include <asm/time.h> -#ifdef CONFIG_X86_64 -__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES; -#endif - unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a9a55e76a43f..a5e2ce931f69 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -346,8 +346,11 @@ bool unwind_next_frame(struct unwind_state *state) /* * Find the orc_entry associated with the text address. * - * Decrement call return addresses by one so they work for sibling - * calls and calls to noreturn functions. + * For a call frame (as opposed to a signal frame), state->ip points to + * the instruction after the call. That instruction's stack layout + * could be different from the call instruction's layout, for example + * if the call was to a noreturn function. So get the ORC data for the + * call instruction itself. */ orc = orc_find(state->signal ? state->ip : state->ip - 1); if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) @@ -550,6 +553,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp; state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + state->signal = (void *)state->ip == ret_from_fork; } if (get_stack_info((unsigned long *)state->sp, state->task, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 2384a2ae5ec3..d3dc8bc6b3ad 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -36,13 +36,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) #ifdef CONFIG_X86_32 OUTPUT_ARCH(i386) ENTRY(phys_startup_32) -jiffies = jiffies_64; #else OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) -jiffies_64 = jiffies; #endif +jiffies = jiffies_64; + #if defined(CONFIG_X86_64) /* * On 64-bit, align RODATA to 2MB so we retain large page mappings for @@ -352,7 +352,8 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) - *(.bss) + . = ALIGN(PAGE_SIZE); + *(BSS_MAIN) . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index f500293dad8d..ce4a9f1f845e 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -5,7 +5,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 537c36b55b5d..d4fdf0e52144 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1918,7 +1918,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || + if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e5af08b58132..1cceee0ed580 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -263,6 +263,11 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; */ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +static u8 __read_mostly shadow_phys_bits; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -275,11 +280,18 @@ static bool is_executable_pte(u64 spte); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { BUG_ON((mmio_mask & mmio_value) != mmio_value); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static bool is_mmio_spte(u64 spte) +{ + return (spte & shadow_mmio_mask) == shadow_mmio_value; +} + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; @@ -287,7 +299,7 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return !(spte & shadow_acc_track_value); } @@ -298,13 +310,13 @@ static bool is_nx_huge_page_enabled(void) static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -374,11 +386,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mmu_spte_set(sptep, mask); } -static bool is_mmio_spte(u64 spte) -{ - return (spte & shadow_mmio_mask) == shadow_mmio_value; -} - static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; @@ -443,6 +450,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected + * in CPU detection code, but MKTME treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore for MKTME + * we should still return physical address bits reported by CPUID. + */ + if (!boot_cpu_has(X86_FEATURE_TME) || + WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) + return boot_cpu_data.x86_phys_bits; + + return cpuid_eax(0x80000008) & 0xff; +} + static void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; @@ -456,20 +478,29 @@ static void kvm_mmu_reset_all_pte_masks(void) shadow_present_mask = 0; shadow_acc_track_mask = 0; + shadow_phys_bits = kvm_get_shadow_phys_bits(); + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. if system RAM overlaps + * the most significant bits of legal physical address space. */ + shadow_nonpresent_or_rsvd_mask = 0; low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_data.x86_phys_bits < - 52 - shadow_nonpresent_or_rsvd_mask_len) { + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; shadow_nonpresent_or_rsvd_mask = - rsvd_bits(boot_cpu_data.x86_phys_bits - - shadow_nonpresent_or_rsvd_mask_len, - boot_cpu_data.x86_phys_bits - 1); - low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); } + shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); } @@ -1682,10 +1713,10 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * Emulate arch specific page modification logging for the * nested hypervisor */ -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu) +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa) { if (kvm_x86_ops->write_log_dirty) - return kvm_x86_ops->write_log_dirty(vcpu); + return kvm_x86_ops->write_log_dirty(vcpu, l2_gpa); return 0; } @@ -4213,7 +4244,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); @@ -4305,7 +4336,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4342,13 +4373,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, false); if (!shadow_me_mask) @@ -4369,7 +4400,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - boot_cpu_data.x86_phys_bits, execonly); + shadow_phys_bits, execonly); } #define BYTE_MASK(access) \ @@ -5666,6 +5697,25 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) return 0; } +static void kvm_set_mmio_spte_mask(void) +{ + u64 mask; + + /* + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. + */ + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask); +} + int kvm_mmu_module_init(void) { if (nx_huge_pages == -1) @@ -5673,6 +5723,8 @@ int kvm_mmu_module_init(void) kvm_mmu_reset_all_pte_masks(); + kvm_set_mmio_spte_mask(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 068feab64acf..816a626b6250 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -194,7 +194,7 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 918b0d5bf272..1c1c2649829b 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -339,7 +339,7 @@ TRACE_EVENT( /* These depend on page entry type, so compute them now. */ __field(bool, r) __field(bool, x) - __field(u8, u) + __field(signed char, u) ), TP_fast_assign( diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8cf7a09bdd73..7260a165488d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -202,7 +202,7 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, - int write_fault) + gpa_t addr, int write_fault) { unsigned level, index; pt_element_t pte, orig_pte; @@ -227,7 +227,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT - if (kvm_arch_write_log_dirty(vcpu)) + if (kvm_arch_write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; @@ -424,7 +424,8 @@ retry_walk: (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { - ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, + addr, write_fault); if (unlikely(ret < 0)) goto error; else if (ret) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d63621386418..78826d123fb8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2757,8 +2757,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_HOST; break; case SVM_EXIT_EXCP_BASE + PF_VECTOR: - /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) + /* Trap async PF even if not shadowing */ + if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason) return NESTED_EXIT_HOST; break; default: @@ -2847,7 +2847,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; - dst->asid = from->asid; + /* asid not copied, it is handled manually for svm->vmcb. */ dst->tlb_ctl = from->tlb_ctl; dst->int_ctl = from->int_ctl; dst->int_vector = from->int_vector; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c139dedec12b..396d41f192ca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5592,6 +5592,8 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { + BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; if (enable_ept) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; @@ -8711,7 +8713,7 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) vmcs_read32(VM_EXIT_INTR_ERROR_CODE), KVM_ISA_VMX); - switch (exit_reason) { + switch ((u16)exit_reason) { case EXIT_REASON_EXCEPTION_NMI: if (is_nmi(intr_info)) return false; @@ -9142,6 +9144,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_APIC_ACCESS && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; @@ -9280,15 +9283,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; + /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; return; } - if (!cpu_need_tpr_shadow(vcpu)) - return; - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -12461,11 +12465,10 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } -static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t gpa; struct page *page = NULL; u64 *pml_address; @@ -12486,7 +12489,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) return 1; } - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + gpa &= ~0xFFFull; page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); if (is_error_page(page)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1477e23827c2..79fa55de635c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -806,6 +806,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; + if ((cr4 ^ old_cr4) & X86_CR4_LA57) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, @@ -2344,7 +2346,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); @@ -2629,7 +2631,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_TSCDEADLINE: @@ -3073,6 +3075,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; + if (vcpu->arch.st.steal.preempted) + return; + vcpu->arch.st.steal.preempted = 1; kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, @@ -4365,10 +4370,13 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof u.ps)) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit_out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); +set_pit_out: + mutex_unlock(&kvm->lock); break; } case KVM_GET_PIT2: { @@ -4388,10 +4396,13 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit2_out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); +set_pit2_out: + mutex_unlock(&kvm->lock); break; } case KVM_REINJECT_CONTROL: { @@ -6291,35 +6302,6 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - int maxphyaddr = boot_cpu_data.x86_phys_bits; - - /* - * Set the reserved bits and the present bit of an paging-structure - * entry to generate page fault with PFER.RSV = 1. - */ - - /* - * Mask the uppermost physical address bit, which would be reserved as - * long as the supported physical address width is less than 52. - */ - mask = 1ull << 51; - - /* Set the present bit. */ - mask |= 1ull; - - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. - */ - if (maxphyaddr == 52) - mask &= ~1ull; - - kvm_mmu_set_mmio_spte_mask(mask, mask); -} - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -6397,8 +6379,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_set_mmio_spte_mask(); - kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 75d3776123cc..2c3b4bcbe8f2 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -118,7 +118,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) */ if (size < 8) { if (!IS_ALIGNED(dest, 4) || size != 4) - clean_cache_range(dst, 1); + clean_cache_range(dst, size); } else { if (!IS_ALIGNED(dest, 8)) { dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index f031c0e19356..515cdee90df7 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -209,7 +209,7 @@ sqrt_stage_2_finish: #ifdef PARANOID /* It should be possible to get here only if the arg is ffff....ffff */ - cmp $0xffffffff,FPU_fsqrt_arg_1 + cmpl $0xffffffff,FPU_fsqrt_arg_1 jnz sqrt_stage_2_error #endif /* PARANOID */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 32bb38f6fc18..8039a951db8f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -112,8 +112,6 @@ __ref void *alloc_low_pages(unsigned int num) } else { pfn = pgt_buf_end; pgt_buf_end += num; - printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", - pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); } for (i = 0; i < num; i++) { diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 4d434ddb75db..f140b2d39319 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -385,7 +385,7 @@ static void enter_uniprocessor(void) int cpu; int err; - if (downed_cpus == NULL && + if (!cpumask_available(downed_cpus) && !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { pr_notice("Failed to allocate mask\n"); goto out; @@ -415,7 +415,7 @@ static void leave_uniprocessor(void) int cpu; int err; - if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) + if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0) return; pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 33e9b4f1ce20..c177da94fc79 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -572,6 +572,10 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ec, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ed, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26c, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar); /* * Device [1022:7808] diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 03fc397335b7..c9fc725a1dcf 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -171,9 +171,10 @@ static struct irq_domain *uv_get_irq_domain(void) goto out; uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL); - irq_domain_free_fwnode(fn); if (uv_domain) uv_domain->parent = x86_vector_domain; + else + irq_domain_free_fwnode(fn); out: mutex_unlock(&uv_lock); |