Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kconfig | 8
-rw-r--r-- arch/x86/Makefile | 6
-rw-r--r-- arch/x86/boot/Makefile | 2
-rw-r--r-- arch/x86/boot/compressed/Makefile | 2
-rw-r--r-- arch/x86/boot/compressed/head_32.S | 7
-rw-r--r-- arch/x86/boot/compressed/head_64.S | 5
-rw-r--r-- arch/x86/configs/i386_defconfig | 2
-rw-r--r-- arch/x86/configs/x86_64_defconfig | 2
-rw-r--r-- arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 14
-rw-r--r-- arch/x86/entry/calling.h | 38
-rw-r--r-- arch/x86/entry/entry_32.S | 1
-rw-r--r-- arch/x86/entry/entry_64.S | 7
-rw-r--r-- arch/x86/entry/entry_64_compat.S | 8
-rw-r--r-- arch/x86/entry/vdso/vma.c | 2
-rw-r--r-- arch/x86/events/amd/ibs.c | 61
-rw-r--r-- arch/x86/events/amd/iommu.c | 6
-rw-r--r-- arch/x86/events/core.c | 1
-rw-r--r-- arch/x86/events/intel/cstate.c | 6
-rw-r--r-- arch/x86/events/intel/ds.c | 2
-rw-r--r-- arch/x86/events/intel/pt.c | 6
-rw-r--r-- arch/x86/events/intel/rapl.c | 14
-rw-r--r-- arch/x86/events/intel/uncore.c | 4
-rw-r--r-- arch/x86/events/intel/uncore.h | 12
-rw-r--r-- arch/x86/events/intel/uncore_snbep.c | 4
-rw-r--r-- arch/x86/events/perf_event.h | 3
-rw-r--r-- arch/x86/include/asm/apic.h | 10
-rw-r--r-- arch/x86/include/asm/barrier.h | 18
-rw-r--r-- arch/x86/include/asm/bitops.h | 29
-rw-r--r-- arch/x86/include/asm/cpu_device_id.h | 27
-rw-r--r-- arch/x86/include/asm/cpufeatures.h | 5
-rw-r--r-- arch/x86/include/asm/dma.h | 2
-rw-r--r-- arch/x86/include/asm/fpu/internal.h | 49
-rw-r--r-- arch/x86/include/asm/insn.h | 15
-rw-r--r-- arch/x86/include/asm/kvm_host.h | 4
-rw-r--r-- arch/x86/include/asm/microcode_amd.h | 2
-rw-r--r-- arch/x86/include/asm/msr-index.h | 5
-rw-r--r-- arch/x86/include/asm/msr.h | 4
-rw-r--r-- arch/x86/include/asm/nospec-branch.h | 5
-rw-r--r-- arch/x86/include/asm/page_64_types.h | 2
-rw-r--r-- arch/x86/include/asm/percpu.h | 2
-rw-r--r-- arch/x86/include/asm/pgtable-3level_types.h | 1
-rw-r--r-- arch/x86/include/asm/pgtable.h | 12
-rw-r--r-- arch/x86/include/asm/pgtable_64_types.h | 2
-rw-r--r-- arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r-- arch/x86/include/asm/pkeys.h | 5
-rw-r--r-- arch/x86/include/asm/processor.h | 11
-rw-r--r-- arch/x86/include/asm/proto.h | 2
-rw-r--r-- arch/x86/include/asm/stackprotector.h | 7
-rw-r--r-- arch/x86/include/asm/svm.h | 2
-rw-r--r-- arch/x86/include/asm/thread_info.h | 23
-rw-r--r-- arch/x86/include/asm/uaccess.h | 12
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 27
-rw-r--r-- arch/x86/kernel/acpi/cstate.c | 3
-rw-r--r-- arch/x86/kernel/apic/apic.c | 31
-rw-r--r-- arch/x86/kernel/apic/io_apic.c | 26
-rw-r--r-- arch/x86/kernel/apic/msi.c | 16
-rw-r--r-- arch/x86/kernel/apic/vector.c | 5
-rw-r--r-- arch/x86/kernel/apic/x2apic_cluster.c | 6
-rw-r--r-- arch/x86/kernel/apic/x2apic_phys.c | 6
-rw-r--r-- arch/x86/kernel/cpu/bugs.c | 214
-rw-r--r-- arch/x86/kernel/cpu/common.c | 56
-rw-r--r-- arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r-- arch/x86/kernel/cpu/intel_rdt.c | 68
-rw-r--r-- arch/x86/kernel/cpu/intel_rdt.h | 6
-rw-r--r-- arch/x86/kernel/cpu/intel_rdt_monitor.c | 27
-rw-r--r-- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 304
-rw-r--r-- arch/x86/kernel/cpu/match.c | 7
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-inject.c | 2
-rw-r--r-- arch/x86/kernel/cpu/microcode/core.c | 8
-rw-r--r-- arch/x86/kernel/cpu/microcode/intel.c | 63
-rw-r--r-- arch/x86/kernel/cpu/mshyperv.c | 4
-rw-r--r-- arch/x86/kernel/cpu/mtrr/generic.c | 6
-rw-r--r-- arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r-- arch/x86/kernel/crash.c | 3
-rw-r--r-- arch/x86/kernel/fpu/init.c | 30
-rw-r--r-- arch/x86/kernel/fpu/regset.c | 2
-rw-r--r-- arch/x86/kernel/fpu/signal.c | 31
-rw-r--r-- arch/x86/kernel/fpu/xstate.c | 133
-rw-r--r-- arch/x86/kernel/i8259.c | 2
-rw-r--r-- arch/x86/kernel/irq.c | 4
-rw-r--r-- arch/x86/kernel/kexec-bzimage64.c | 3
-rw-r--r-- arch/x86/kernel/kprobes/core.c | 38
-rw-r--r-- arch/x86/kernel/module.c | 1
-rw-r--r-- arch/x86/kernel/process.c | 28
-rw-r--r-- arch/x86/kernel/process.h | 2
-rw-r--r-- arch/x86/kernel/ptrace.c | 2
-rw-r--r-- arch/x86/kernel/reboot.c | 49
-rw-r--r-- arch/x86/kernel/setup.c | 7
-rw-r--r-- arch/x86/kernel/signal.c | 24
-rw-r--r-- arch/x86/kernel/smpboot.c | 8
-rw-r--r-- arch/x86/kernel/time.c | 4
-rw-r--r-- arch/x86/kernel/unwind_orc.c | 30
-rw-r--r-- arch/x86/kernel/uprobes.c | 10
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 7
-rw-r--r-- arch/x86/kvm/cpuid.c | 11
-rw-r--r-- arch/x86/kvm/emulate.c | 10
-rw-r--r-- arch/x86/kvm/kvm_cache_regs.h | 2
-rw-r--r-- arch/x86/kvm/lapic.c | 2
-rw-r--r-- arch/x86/kvm/mmu.c | 106
-rw-r--r-- arch/x86/kvm/mmu.h | 4
-rw-r--r-- arch/x86/kvm/mmutrace.h | 2
-rw-r--r-- arch/x86/kvm/paging_tmpl.h | 21
-rw-r--r-- arch/x86/kvm/pmu_intel.c | 2
-rw-r--r-- arch/x86/kvm/svm.c | 33
-rw-r--r-- arch/x86/kvm/vmx.c | 125
-rw-r--r-- arch/x86/kvm/x86.c | 98
-rw-r--r-- arch/x86/lib/msr-smp.c | 4
-rw-r--r-- arch/x86/lib/usercopy_64.c | 2
-rw-r--r-- arch/x86/math-emu/wm_sqrt.S | 2
-rw-r--r-- arch/x86/mm/ident_map.c | 12
-rw-r--r-- arch/x86/mm/init.c | 2
-rw-r--r-- arch/x86/mm/init_64.c | 6
-rw-r--r-- arch/x86/mm/ioremap.c | 4
-rw-r--r-- arch/x86/mm/mem_encrypt.c | 4
-rw-r--r-- arch/x86/mm/mmio-mod.c | 4
-rw-r--r-- arch/x86/mm/pgtable.c | 2
-rw-r--r-- arch/x86/net/bpf_jit_comp.c | 29
-rw-r--r-- arch/x86/pci/fixup.c | 4
-rw-r--r-- arch/x86/platform/efi/quirks.c | 3
-rw-r--r-- arch/x86/platform/uv/uv_irq.c | 3
-rw-r--r-- arch/x86/realmode/init.c | 12
-rw-r--r-- arch/x86/tools/chkobjdump.awk | 1
-rw-r--r-- arch/x86/tools/relocs.c | 12
-rw-r--r-- arch/x86/xen/enlighten_pv.c | 31
-rw-r--r-- arch/x86/xen/p2m.c | 63
-rw-r--r-- arch/x86/xen/smp_pv.c | 1
-rw-r--r-- arch/x86/xen/spinlock.c | 12
127 files changed, 1538 insertions, 878 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c55870ac907e..6301a8d2b87e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -526,6 +526,7 @@ config X86_UV
depends on X86_EXTENDED_PLATFORM
depends on NUMA
depends on EFI
+ depends on KEXEC_CORE
depends on X86_X2APIC
depends on PCI
---help---
@@ -1448,6 +1449,7 @@ config ARCH_HAS_MEM_ENCRYPT
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
+ select ARCH_USE_MEMREMAP_PROT
---help---
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1455,7 +1457,6 @@ config AMD_MEM_ENCRYPT
config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
bool "Activate AMD Secure Memory Encryption (SME) by default"
- default y
depends on AMD_MEM_ENCRYPT
---help---
Say yes to have system memory encrypted by default if running on
@@ -1467,10 +1468,6 @@ config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
If set to N, then the encryption of system memory can be
activated with the mem_encrypt=on command line option.
-config ARCH_USE_MEMREMAP_PROT
- def_bool y
- depends on AMD_MEM_ENCRYPT
-
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation and Scheduler Support"
@@ -1903,6 +1900,7 @@ config EFI
depends on ACPI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
---help---
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3dc54d2f79c4..844d5a72d2ad 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -35,12 +35,13 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
-DDISABLE_BRANCH_PROFILING \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
- -mno-mmx -mno-sse
+ -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
+REALMODE_CFLAGS += $(CLANG_FLAGS)
export REALMODE_CFLAGS
# BITS is used as extension for files which are available in a 32 bit
@@ -62,6 +63,9 @@ endif
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+# Intel CET isn't enabled in the kernel
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
+
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 1c060748c813..f38ffcc610d2 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -87,7 +87,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 3a250ca2406c..644f9e14cb09 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -36,6 +36,8 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
+# Disable relocation relaxation in case the link is not PIE.
+KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 37380c0d5999..c6c4b877f3d2 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -49,16 +49,17 @@
* Position Independent Executable (PIE) so that linker won't optimize
* R_386_GOT32X relocation to its fixed symbol address. Older
* linkers generate R_386_32 relocations against locally defined symbols,
- * _bss, _ebss, _got and _egot, in PIE. It isn't wrong, just less
+ * _bss, _ebss, _got, _egot and _end, in PIE. It isn't wrong, just less
* optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle
* R_386_32 relocations when relocating the kernel. To generate
- * R_386_RELATIVE relocations, we mark _bss, _ebss, _got and _egot as
+ * R_386_RELATIVE relocations, we mark _bss, _ebss, _got, _egot and _end as
* hidden:
*/
.hidden _bss
.hidden _ebss
.hidden _got
.hidden _egot
+ .hidden _end
__HEAD
ENTRY(startup_32)
@@ -106,7 +107,7 @@ ENTRY(startup_32)
notl %eax
andl %eax, %ebx
cmpl $LOAD_PHYSICAL_ADDR, %ebx
- jge 1f
+ jae 1f
#endif
movl $LOAD_PHYSICAL_ADDR, %ebx
1:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 39fdede523f2..7ab1c6bcc66a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -41,6 +41,7 @@
.hidden _ebss
.hidden _got
.hidden _egot
+ .hidden _end
__HEAD
.code32
@@ -105,7 +106,7 @@ ENTRY(startup_32)
notl %eax
andl %eax, %ebx
cmpl $LOAD_PHYSICAL_ADDR, %ebx
- jge 1f
+ jae 1f
#endif
movl $LOAD_PHYSICAL_ADDR, %ebx
1:
@@ -280,7 +281,7 @@ ENTRY(startup_64)
notq %rax
andq %rax, %rbp
cmpq $LOAD_PHYSICAL_ADDR, %rbp
- jge 1f
+ jae 1f
#endif
movq $LOAD_PHYSICAL_ADDR, %rbp
1:
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 0eb9f92f3717..136972a4c454 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -217,7 +217,6 @@ CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y
CONFIG_FB_EFI=y
# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_MONO is not set
# CONFIG_LOGO_LINUX_VGA16 is not set
@@ -247,6 +246,7 @@ CONFIG_USB_HIDDEV=y
CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
+CONFIG_USB_XHCI_HCD=y
CONFIG_USB_EHCI_HCD=y
CONFIG_USB_EHCI_TT_NEWSCHED=y
CONFIG_USB_OHCI_HCD=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index e32fc1f274d8..d387193ef7fc 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -213,7 +213,6 @@ CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y
CONFIG_FB_EFI=y
# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_MONO is not set
# CONFIG_LOGO_LINUX_VGA16 is not set
@@ -243,6 +242,7 @@ CONFIG_USB_HIDDEV=y
CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
+CONFIG_USB_XHCI_HCD=y
CONFIG_USB_EHCI_HCD=y
CONFIG_USB_EHCI_TT_NEWSCHED=y
CONFIG_USB_OHCI_HCD=y
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index 5f6a5af9c489..77043a82da51 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -127,10 +127,6 @@ ddq_add_8:
/* generate a unique variable for ddq_add_x */
-.macro setddq n
- var_ddq_add = ddq_add_\n
-.endm
-
/* generate a unique variable for xmm register */
.macro setxdata n
var_xdata = %xmm\n
@@ -140,9 +136,7 @@ ddq_add_8:
.macro club name, id
.altmacro
- .if \name == DDQ_DATA
- setddq %\id
- .elseif \name == XDATA
+ .if \name == XDATA
setxdata %\id
.endif
.noaltmacro
@@ -165,9 +159,8 @@ ddq_add_8:
.set i, 1
.rept (by - 1)
- club DDQ_DATA, i
club XDATA, i
- vpaddq var_ddq_add(%rip), xcounter, var_xdata
+ vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
vptest ddq_low_msk(%rip), var_xdata
jnz 1f
vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
@@ -180,8 +173,7 @@ ddq_add_8:
vmovdqa 1*16(p_keys), xkeyA
vpxor xkey0, xdata0, xdata0
- club DDQ_DATA, by
- vpaddq var_ddq_add(%rip), xcounter, xcounter
+ vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
vptest ddq_low_msk(%rip), xcounter
jnz 1f
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 557c1bdda311..1dbc62a96b85 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -98,13 +98,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
- /*
- * Push registers and sanitize registers of values that a
- * speculation attack might otherwise want to exploit. The
- * lower registers are likely clobbered well before they
- * could be put to use in a speculative execution gadget.
- * Interleave XOR with PUSH for better uop scheduling:
- */
.if \save_ret
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
@@ -117,29 +110,40 @@ For 32-bit we have the following conventions - kernel is built with
pushq %rcx /* pt_regs->cx */
pushq \rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
- xorl %r8d, %r8d /* nospec r8 */
pushq %r9 /* pt_regs->r9 */
- xorl %r9d, %r9d /* nospec r9 */
pushq %r10 /* pt_regs->r10 */
- xorl %r10d, %r10d /* nospec r10 */
pushq %r11 /* pt_regs->r11 */
- xorl %r11d, %r11d /* nospec r11*/
pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx*/
pushq %rbp /* pt_regs->rbp */
- xorl %ebp, %ebp /* nospec rbp*/
pushq %r12 /* pt_regs->r12 */
- xorl %r12d, %r12d /* nospec r12*/
pushq %r13 /* pt_regs->r13 */
- xorl %r13d, %r13d /* nospec r13*/
pushq %r14 /* pt_regs->r14 */
- xorl %r14d, %r14d /* nospec r14*/
pushq %r15 /* pt_regs->r15 */
- xorl %r15d, %r15d /* nospec r15*/
UNWIND_HINT_REGS
+
.if \save_ret
pushq %rsi /* return address on top of stack */
.endif
+
+ /*
+ * Sanitize registers of values that a speculation attack might
+ * otherwise want to exploit. The lower registers are likely clobbered
+ * well before they could be put to use in a speculative execution
+ * gadget.
+ */
+ xorl %edx, %edx /* nospec dx */
+ xorl %ecx, %ecx /* nospec cx */
+ xorl %r8d, %r8d /* nospec r8 */
+ xorl %r9d, %r9d /* nospec r9 */
+ xorl %r10d, %r10d /* nospec r10 */
+ xorl %r11d, %r11d /* nospec r11 */
+ xorl %ebx, %ebx /* nospec rbx */
+ xorl %ebp, %ebp /* nospec rbp */
+ xorl %r12d, %r12d /* nospec r12 */
+ xorl %r13d, %r13d /* nospec r13 */
+ xorl %r14d, %r14d /* nospec r14 */
+ xorl %r15d, %r15d /* nospec r15 */
+
.endm
.macro POP_REGS pop_rdi=1 skip_r11rcx=0
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 49adabd94f88..c19974a49378 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1057,6 +1057,7 @@ ENTRY(int3)
END(int3)
ENTRY(general_protection)
+ ASM_CLAC
pushl $do_general_protection
jmp common_exception
END(general_protection)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 5ec66fafde4e..ac389ffb1822 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -55,7 +55,7 @@ END(native_usergs_sysret64)
.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9, EFLAGS(%rsp) /* interrupts off? */
+ btl $9, EFLAGS(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON
1:
@@ -302,7 +302,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
- UNWIND_HINT_EMPTY
POP_REGS pop_rdi=0 skip_r11rcx=1
/*
@@ -311,6 +310,7 @@ syscall_return_via_sysret:
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
@@ -606,6 +606,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
@@ -1648,7 +1649,7 @@ ENTRY(rewind_stack_do_exit)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
leaq -PTREGS_SIZE(%rax), %rsp
- UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
+ UNWIND_HINT_REGS
call do_exit
END(rewind_stack_do_exit)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 364ea4a207be..304e3daf82dd 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -357,13 +357,13 @@ ENTRY(entry_INT80_compat)
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r8 */
xorl %r8d, %r8d /* nospec r8 */
- pushq $0 /* pt_regs->r9 = 0 */
+ pushq %r9 /* pt_regs->r9 */
xorl %r9d, %r9d /* nospec r9 */
- pushq $0 /* pt_regs->r10 = 0 */
+ pushq %r10 /* pt_regs->r10*/
xorl %r10d, %r10d /* nospec r10 */
- pushq $0 /* pt_regs->r11 = 0 */
+ pushq %r11 /* pt_regs->r11 */
xorl %r11d, %r11d /* nospec r11 */
pushq %rbx /* pt_regs->rbx */
xorl %ebx, %ebx /* nospec rbx */
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index a77fd3c8d824..2ab8628aef10 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -342,7 +342,7 @@ static void vgetcpu_cpu_init(void *arg)
#ifdef CONFIG_NUMA
node = cpu_to_node(cpu);
#endif
- if (static_cpu_has(X86_FEATURE_RDTSCP))
+ if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
write_rdtscp_aux((node << 12) | cpu);
/*
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index f24e9adaa316..5043425ced6b 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -89,6 +89,8 @@ struct perf_ibs {
u64 max_period;
unsigned long offset_mask[1];
int offset_max;
+ unsigned int fetch_count_reset_broken : 1;
+ unsigned int fetch_ignore_if_zero_rip : 1;
struct cpu_perf_ibs __percpu *pcpu;
struct attribute **format_attrs;
@@ -346,11 +348,15 @@ static u64 get_ibs_op_count(u64 config)
{
u64 count = 0;
+ /*
+ * If the internal 27-bit counter rolled over, the count is MaxCnt
+ * and the lower 7 bits of CurCnt are randomized.
+ * Otherwise CurCnt has the full 27-bit current counter value.
+ */
if (config & IBS_OP_VAL)
- count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
-
- if (ibs_caps & IBS_CAPS_RDWROPCNT)
- count += (config & IBS_OP_CUR_CNT) >> 32;
+ count = (config & IBS_OP_MAX_CNT) << 4;
+ else if (ibs_caps & IBS_CAPS_RDWROPCNT)
+ count = (config & IBS_OP_CUR_CNT) >> 32;
return count;
}
@@ -375,7 +381,12 @@ perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
- wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+ u64 tmp = hwc->config | config;
+
+ if (perf_ibs->fetch_count_reset_broken)
+ wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask);
+
+ wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask);
}
/*
@@ -637,18 +648,24 @@ fail:
perf_ibs->offset_max,
offset + 1);
} while (offset < offset_max);
+ /*
+ * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
+ * depending on their availability.
+ * Can't add to offset_max as they are staggered
+ */
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
- /*
- * Read IbsBrTarget and IbsOpData4 separately
- * depending on their availability.
- * Can't add to offset_max as they are staggered
- */
- if (ibs_caps & IBS_CAPS_BRNTRGT) {
- rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
- size++;
+ if (perf_ibs == &perf_ibs_op) {
+ if (ibs_caps & IBS_CAPS_BRNTRGT) {
+ rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
+ size++;
+ }
+ if (ibs_caps & IBS_CAPS_OPDATA4) {
+ rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
+ size++;
+ }
}
- if (ibs_caps & IBS_CAPS_OPDATA4) {
- rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
+ if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
+ rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++);
size++;
}
}
@@ -658,6 +675,10 @@ fail:
if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
regs.flags &= ~PERF_EFLAGS_EXACT;
} else {
+ /* Workaround for erratum #1197 */
+ if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
+ goto out;
+
set_linear_ip(&regs, ibs_data.regs[1]);
regs.flags |= PERF_EFLAGS_EXACT;
}
@@ -744,6 +765,16 @@ static __init void perf_event_ibs_init(void)
{
struct attribute **attr = ibs_op_format_attrs;
+ /*
+ * Some chips fail to reset the fetch count when it is written; instead
+ * they need a 0-1 transition of IbsFetchEn.
+ */
+ if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
+ perf_ibs_fetch.fetch_count_reset_broken = 1;
+
+ if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
+ perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
+
perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
if (ibs_caps & IBS_CAPS_OPCNT) {
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index 3641e24fdac5..5a372b8902f4 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -84,12 +84,12 @@ static struct attribute_group amd_iommu_events_group = {
};
struct amd_iommu_event_desc {
- struct kobj_attribute attr;
+ struct device_attribute attr;
const char *event;
};
-static ssize_t _iommu_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t _iommu_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct amd_iommu_event_desc *event =
container_of(attr, struct amd_iommu_event_desc, attr);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index c1f7b3cb84a9..39c298afa2ea 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2094,6 +2094,7 @@ static int x86_pmu_event_init(struct perf_event *event)
if (err) {
if (event->destroy)
event->destroy(event);
+ event->destroy = NULL;
}
if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 59521c71c98a..4ebeaa3c6710 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -99,14 +99,14 @@
MODULE_LICENSE("GPL");
#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __cstate_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
+static ssize_t __cstate_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
-static struct kobj_attribute format_attr_##_var = \
+static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)
static ssize_t cstate_get_attr_cpumask(struct device *dev,
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 550b7814ef92..49dd12997a21 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1515,7 +1515,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
*/
if (!pebs_status && cpuc->pebs_enabled &&
!(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
- pebs_status = cpuc->pebs_enabled;
+ pebs_status = p->status = cpuc->pebs_enabled;
bit = find_first_bit((unsigned long *)&pebs_status,
x86_pmu.max_pebs_events);
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 81fd41d5a0d9..990ca9614b23 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -69,7 +69,7 @@ static struct pt_cap_desc {
PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)),
- PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3),
+ PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7),
PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff),
PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
@@ -1190,7 +1190,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
if (!filter->range || !filter->size)
return -EOPNOTSUPP;
- if (!filter->inode) {
+ if (!filter->path.dentry) {
if (!valid_kernel_ip(filter->offset))
return -EINVAL;
@@ -1217,7 +1217,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
return;
list_for_each_entry(filter, &head->list, entry) {
- if (filter->inode && !offs[range]) {
+ if (filter->path.dentry && !offs[range]) {
msr_a = msr_b = 0;
} else {
/* apply the offset */
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index d36a5fac6a18..7d3f861e3165 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -115,18 +115,6 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
* any other bit is reserved
*/
#define RAPL_EVENT_MASK 0xFFULL
-
-#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
- char *page) \
-{ \
- BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
- return sprintf(page, _format "\n"); \
-} \
-static struct kobj_attribute format_attr_##_var = \
- __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-
#define RAPL_CNTR_WIDTH 32
#define RAPL_EVENT_ATTR_STR(_name, v, str) \
@@ -548,7 +536,7 @@ static struct attribute_group rapl_pmu_events_group = {
.attrs = NULL, /* patched at runtime */
};
-DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
&format_attr_event.attr,
NULL,
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index c56cb37b88e3..4ad93871508c 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -90,8 +90,8 @@ end:
return map;
}
-ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct uncore_event_desc *event =
container_of(attr, struct uncore_event_desc, attr);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 4364191e7c6b..ae4ab89e16c7 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -125,7 +125,7 @@ struct intel_uncore_box {
#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */
struct uncore_event_desc {
- struct kobj_attribute attr;
+ struct device_attribute attr;
const char *config;
};
@@ -137,8 +137,8 @@ struct pci2phy_map {
struct pci2phy_map *__find_pci2phy_map(int segment);
-ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf);
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf);
#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
{ \
@@ -147,14 +147,14 @@ ssize_t uncore_event_show(struct kobject *kobj,
}
#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
+static ssize_t __uncore_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
-static struct kobj_attribute format_attr_##_var = \
+static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 6b66285c6ced..b039d416fef3 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3383,6 +3383,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
struct extra_reg *er;
int idx = 0;
+ /* Any of the CHA events may be filtered by Thread/Core-ID.*/
+ if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN)
+ idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID;
for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
if (er->event != (event->hw.config & er->config_mask))
@@ -3450,6 +3453,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
EVENT_CONSTRAINT_END
};
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index bfe16631fd1d..ccd14aca30e7 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -792,9 +792,10 @@ void x86_pmu_stop(struct perf_event *event, int flags);
static inline void x86_pmu_disable_event(struct perf_event *event)
{
+ u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
struct hw_perf_event *hwc = &event->hw;
- wrmsrl(hwc->config_base, hwc->config);
+ wrmsrl(hwc->config_base, hwc->config & ~disable_mask);
}
void x86_pmu_enable_event(struct perf_event *event);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 25a5a5c6ae90..95f59f5dffb3 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -174,16 +174,6 @@ static inline void lapic_update_tsc_freq(void) { }
#endif /* !CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_X2APIC
-/*
- * Make previous memory operations globally visible before
- * sending the IPI through x2apic wrmsr. We need a serializing instruction or
- * mfence for this.
- */
-static inline void x2apic_wrmsr_fence(void)
-{
- asm volatile("mfence" : : : "memory");
-}
-
static inline void native_apic_msr_write(u32 reg, u32 v)
{
if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index bc88797cfa61..3895a7b6952b 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -111,4 +111,22 @@ do { \
#include <asm-generic/barrier.h>
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+ asm volatile("mfence; lfence" : : : "memory");
+}
+
#endif /* _ASM_X86_BARRIER_H */
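For reference, a minimal sketch of how the new helper is meant to be used: make prior stores globally visible before writing one of the non-serializing MSRs. Illustrative only; it mirrors the lapic_next_deadline() change further down and relies on the existing wrmsrl() and MSR_IA32_TSC_DEADLINE definitions.

	static void arm_tsc_deadline_example(u64 deadline)
	{
		/* Order earlier stores before the non-serializing WRMSR. */
		weak_wrmsr_fence();
		wrmsrl(MSR_IA32_TSC_DEADLINE, deadline);
	}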
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 3fa039855b8f..9f645ba57dbb 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -78,7 +78,7 @@ set_bit(long nr, volatile unsigned long *addr)
: "iq" ((u8)CONST_MASK(nr))
: "memory");
} else {
- asm volatile(LOCK_PREFIX "bts %1,%0"
+ asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
: BITOP_ADDR(addr) : "Ir" (nr) : "memory");
}
}
@@ -94,7 +94,7 @@ set_bit(long nr, volatile unsigned long *addr)
*/
static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
+ asm volatile(__ASM_SIZE(bts) " %1,%0" : ADDR : "Ir" (nr) : "memory");
}
/**
@@ -115,7 +115,7 @@ clear_bit(long nr, volatile unsigned long *addr)
: CONST_MASK_ADDR(nr, addr)
: "iq" ((u8)~CONST_MASK(nr)));
} else {
- asm volatile(LOCK_PREFIX "btr %1,%0"
+ asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
: BITOP_ADDR(addr)
: "Ir" (nr));
}
@@ -137,7 +137,7 @@ static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *ad
static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
+ asm volatile(__ASM_SIZE(btr) " %1,%0" : ADDR : "Ir" (nr));
}
static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
@@ -182,7 +182,7 @@ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *
*/
static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
+ asm volatile(__ASM_SIZE(btc) " %1,%0" : ADDR : "Ir" (nr));
}
/**
@@ -201,7 +201,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr)
: CONST_MASK_ADDR(nr, addr)
: "iq" ((u8)CONST_MASK(nr)));
} else {
- asm volatile(LOCK_PREFIX "btc %1,%0"
+ asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
: BITOP_ADDR(addr)
: "Ir" (nr));
}
@@ -217,7 +217,8 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr)
*/
static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", c);
+ GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts),
+ *addr, "Ir", nr, "%0", c);
}
/**
@@ -246,7 +247,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
{
bool oldbit;
- asm("bts %2,%1"
+ asm(__ASM_SIZE(bts) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
@@ -263,7 +264,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
*/
static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", c);
+ GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr),
+ *addr, "Ir", nr, "%0", c);
}
/**
@@ -286,7 +288,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
{
bool oldbit;
- asm volatile("btr %2,%1"
+ asm volatile(__ASM_SIZE(btr) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
@@ -298,7 +300,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
{
bool oldbit;
- asm volatile("btc %2,%1"
+ asm volatile(__ASM_SIZE(btc) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr) : "memory");
@@ -316,7 +318,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
*/
static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
- GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", c);
+ GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc),
+ *addr, "Ir", nr, "%0", c);
}
static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
@@ -329,7 +332,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
{
bool oldbit;
- asm volatile("bt %2,%1"
+ asm volatile(__ASM_SIZE(bt) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index baeba0567126..884466592943 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -9,6 +9,33 @@
#include <linux/mod_devicetable.h>
+#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
+
+/**
+ * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
+ * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+ * The name is expanded to X86_VENDOR_@_vendor
+ * @_family: The family number or X86_FAMILY_ANY
+ * @_model: The model number, model constant or X86_MODEL_ANY
+ * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY
+ * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY
+ * @_data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is casted to unsigned long internally.
+ *
+ * Backport version to keep the SRBDS pile consistent. No shorter variants
+ * required for this.
+ */
+#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
+ _steppings, _feature, _data) { \
+ .vendor = X86_VENDOR_##_vendor, \
+ .family = _family, \
+ .model = _model, \
+ .steppings = _steppings, \
+ .feature = _feature, \
+ .driver_data = (unsigned long) _data \
+}
+
extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
#endif
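A usage sketch for the new macro. The table and its values are hypothetical, and the .steppings member assumes the matching struct x86_cpu_id change to mod_devicetable.h from the same series:

	static const struct x86_cpu_id example_ids[] = {
		/* Hypothetical entry: Intel, family 6, model 0x55, steppings 0x0-0x5 */
		X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, 0x55,
							     X86_STEPPINGS(0x0, 0x5),
							     X86_FEATURE_ANY, NULL),
		{}
	};

	if (x86_match_cpu(example_ids))
		pr_info("matched CPU\n");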
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b4bef819d5d5..e08866cd2287 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -205,7 +205,7 @@
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
-
+#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
@@ -291,6 +291,7 @@
#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */
#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
@@ -346,6 +347,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
@@ -390,5 +392,6 @@
#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
+#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 00f7cf45e699..8e95aa4b0d17 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -74,7 +74,7 @@
#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT)
/* 4GB broken PCI/AGP hardware bus master zone */
-#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT))
#ifdef CONFIG_X86_32
/* The maximum address that we can perform a DMA transfer to on this platform */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fa2c93cb42a2..4f274d851986 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -103,6 +103,7 @@ static inline void fpstate_init_fxstate(struct fxregs_state *fx)
}
extern void fpstate_sanitize_xstate(struct fpu *fpu);
+/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */
#define user_insn(insn, output, input...) \
({ \
int err; \
@@ -110,14 +111,14 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
might_fault(); \
\
asm volatile(ASM_STAC "\n" \
- "1:" #insn "\n\t" \
+ "1: " #insn "\n" \
"2: " ASM_CLAC "\n" \
".section .fixup,\"ax\"\n" \
- "3: movl $-1,%[err]\n" \
+ "3: negl %%eax\n" \
" jmp 2b\n" \
".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err), output \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : [err] "=a" (err), output \
: "0"(0), input); \
err; \
})
@@ -214,6 +215,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
}
}
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
@@ -221,16 +230,20 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+/*
+ * After this @err contains 0 on success or the negated trap number when
+ * the operation raises an exception. For faults this results in -EFAULT.
+ */
#define XSTATE_OP(op, st, lmask, hmask, err) \
asm volatile("1:" op "\n\t" \
"xor %[err], %[err]\n" \
"2:\n\t" \
".pushsection .fixup,\"ax\"\n\t" \
- "3: movl $-2,%[err]\n\t" \
+ "3: negl %%eax\n\t" \
"jmp 2b\n\t" \
".popsection\n\t" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err) \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : [err] "=a" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
@@ -282,28 +295,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
*/
-static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
-{
- u64 mask = -1;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (static_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
{
u64 mask = -1;
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index c2c01f84df75..3e0e18d376d2 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -208,6 +208,21 @@ static inline int insn_offset_immediate(struct insn *insn)
return insn_offset_displacement(insn) + insn->displacement.nbytes;
}
+/**
+ * for_each_insn_prefix() -- Iterate prefixes in the instruction
+ * @insn: Pointer to struct insn.
+ * @idx: Index storage.
+ * @prefix: Prefix byte.
+ *
+ * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix
+ * and the index is stored in @idx (note that this @idx is just for a cursor,
+ * do not change it.)
+ * Since prefixes.nbytes can be bigger than 4 if some prefixes
+ * are repeated, it cannot be used for looping over the prefixes.
+ */
+#define for_each_insn_prefix(insn, idx, prefix) \
+ for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
+
#define POP_SS_OPCODE 0x1f
#define MOV_SREG_OPCODE 0x8e
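A minimal usage sketch for the new iterator; the helper below is hypothetical and only shows the iteration pattern:

	/* Return true if the decoded instruction carries a 0x66 operand-size prefix. */
	static bool insn_has_operand_size_prefix(struct insn *insn)
	{
		insn_byte_t prefix;
		int i;

		for_each_insn_prefix(insn, i, prefix) {
			if (prefix == 0x66)
				return true;
		}
		return false;
	}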
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2cdf654ed132..ecb6009a2c8a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1032,7 +1032,7 @@ struct kvm_x86_ops {
bool (*mpx_supported)(void);
bool (*xsaves_supported)(void);
- int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
+ int (*check_nested_events)(struct kvm_vcpu *vcpu);
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
@@ -1060,7 +1060,7 @@ struct kvm_x86_ops {
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t offset, unsigned long mask);
- int (*write_log_dirty)(struct kvm_vcpu *vcpu);
+ int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 209492849566..5c524d4f71cd 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -41,7 +41,7 @@ struct microcode_amd {
unsigned int mpb[0];
};
-#define PATCH_MAX_SIZE PAGE_SIZE
+#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
#ifdef CONFIG_MICROCODE_AMD
extern void __init load_ucode_amd_bsp(unsigned int family);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5761a86b88e0..e8ffeebd24b6 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -110,6 +110,10 @@
#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
+/* SRBDS support */
+#define MSR_IA32_MCU_OPT_CTRL 0x00000123
+#define RNGDS_MITG_DIS BIT(0)
+
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
#define MSR_IA32_SYSENTER_EIP 0x00000176
@@ -373,6 +377,7 @@
#define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1)
#define MSR_AMD64_IBSCTL 0xc001103a
#define MSR_AMD64_IBSBRTARGET 0xc001103b
+#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
#define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 30df295f6d94..18f9a9b7280b 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -88,7 +88,7 @@ static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
* think of extending them - you will be slapped with a stinking trout or a frozen
* shark will reach you, wherever you are! You've been warned.
*/
-static inline unsigned long long notrace __rdmsr(unsigned int msr)
+static __always_inline unsigned long long __rdmsr(unsigned int msr)
{
DECLARE_ARGS(val, low, high);
@@ -100,7 +100,7 @@ static inline unsigned long long notrace __rdmsr(unsigned int msr)
return EAX_EDX_VAL(val, low, high);
}
-static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high)
+static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
{
asm volatile("1: wrmsr\n"
"2:\n"
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index b73a16a56e4f..270448b178a7 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -232,6 +232,7 @@ enum spectre_v2_mitigation {
enum spectre_v2_user_mitigation {
SPECTRE_V2_USER_NONE,
SPECTRE_V2_USER_STRICT,
+ SPECTRE_V2_USER_STRICT_PREFERRED,
SPECTRE_V2_USER_PRCTL,
SPECTRE_V2_USER_SECCOMP,
};
@@ -329,7 +330,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
* combination with microcode which triggers a CPU buffer flush when the
* instruction is executed.
*/
-static inline void mds_clear_cpu_buffers(void)
+static __always_inline void mds_clear_cpu_buffers(void)
{
static const u16 ds = __KERNEL_DS;
@@ -350,7 +351,7 @@ static inline void mds_clear_cpu_buffers(void)
*
* Clear CPU buffers if the corresponding static key is enabled
*/
-static inline void mds_user_clear_cpu_buffers(void)
+static __always_inline void mds_user_clear_cpu_buffers(void)
{
if (static_branch_likely(&mds_user_clear))
mds_clear_cpu_buffers();
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 50c8baaca4b0..4c807635e624 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -20,7 +20,7 @@
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define CURRENT_MASK (~(THREAD_SIZE - 1))
-#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
+#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER)
#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
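Worked out for the default 4 KiB pages with KASAN disabled (KASAN_STACK_ORDER == 0):

	EXCEPTION_STKSZ = PAGE_SIZE << EXCEPTION_STACK_ORDER
	                = 4 KiB << 1 = 8 KiB	/* was 4 KiB at order 0 */

so each exception stack doubles, and DEBUG_STACK_ORDER grows with it.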
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 12aa2bb6bac4..6abf3af96fc8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -526,7 +526,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
{
bool oldbit;
- asm volatile("bt "__percpu_arg(2)",%1"
+ asm volatile("btl "__percpu_arg(2)",%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 876b4c77d983..6a59a6d0cc50 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -44,5 +44,6 @@ typedef union {
*/
#define PTRS_PER_PTE 512
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6a4b1a54ff47..2711b63945ae 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -234,6 +234,7 @@ static inline int pmd_large(pmd_t pte)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */
static inline int pmd_trans_huge(pmd_t pmd)
{
return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
@@ -588,12 +589,15 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
return __pmd(val);
}
-/* mprotect needs to preserve PAT bits when updating vm_page_prot */
+/*
+ * mprotect needs to preserve PAT and encryption bits when updating
+ * vm_page_prot
+ */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
- pgprotval_t addbits = pgprot_val(newprot);
+ pgprotval_t addbits = pgprot_val(newprot) & ~_PAGE_CHG_MASK;
return __pgprot(preservebits | addbits);
}
@@ -1216,8 +1220,8 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
#endif
#endif
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2
static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index bf6d2692fc60..2bd79b7ae9d6 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -40,6 +40,8 @@ typedef struct { pteval_t pte; } pte_t;
#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
#define P4D_MASK (~(P4D_SIZE - 1))
+#define MAX_POSSIBLE_PHYSMEM_BITS 52
+
#else /* CONFIG_X86_5LEVEL */
/*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 85f8279c885a..7b0596b79d71 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -124,7 +124,7 @@
*/
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
+ _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
@@ -148,6 +148,7 @@ enum page_cache_mode {
#endif
#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
+#define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 851c04b7a092..1572a436bc08 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -4,6 +4,11 @@
#define ARCH_DEFAULT_PKEY 0
+/*
+ * If more than 16 keys are ever supported, a thorough audit
+ * will be necessary to ensure that the types that store key
+ * numbers and masks have sufficient capacity.
+ */
#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6a87eda9691e..be441d520d63 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -344,7 +344,7 @@ struct x86_hw_tss {
#define INVALID_IO_BITMAP_OFFSET 0x8000
struct entry_stack {
- unsigned long words[64];
+ char stack[PAGE_SIZE];
};
struct entry_stack_page {
@@ -512,15 +512,6 @@ struct thread_struct {
};
/*
- * Thread-synchronous status.
- *
- * This is different from the flags in that nobody else
- * ever touches our thread-synchronous status, so we don't
- * have to worry about atomic accesses.
- */
-#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
-
-/*
* Set IOPL bits in EFLAGS from given mask
*/
static inline void native_set_iopl_mask(unsigned mask)
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 6e81788a30c1..0eaca7a130c9 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -4,6 +4,8 @@
#include <asm/ldt.h>
+struct task_struct;
+
/* misc architecture specific prototypes */
void syscall_init(void);
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 371b3a4af000..55b72ea5e01d 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -55,8 +55,13 @@
/*
* Initialize the stackprotector canary value.
*
- * NOTE: this must only be called from functions that never return,
+ * NOTE: this must only be called from functions that never return
* and it must always be inlined.
+ *
+ * In addition, it should be called from a compilation unit for which
+ * stack protector is disabled. Alternatively, the caller should not end
+ * with a function call which gets tail-call optimized as that would
+ * lead to checking a modified canary value.
*/
static __always_inline void boot_init_stack_canary(void)
{
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 78dd9df88157..2a9e81e93aac 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -117,6 +117,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_IGN_TPR_SHIFT 20
#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+#define V_IRQ_INJECTION_BITS_MASK (V_IRQ_MASK | V_INTR_PRIO_MASK | V_IGN_TPR_MASK)
+
#define V_INTR_MASKING_SHIFT 24
#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index bf9175d87844..a77f0ad96d94 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -229,10 +229,31 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
+/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
+
+#ifndef __ASSEMBLY__
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
+#define TS_COMPAT_RESTART 0x0008
+
+#define arch_set_restart_data arch_set_restart_data
+
+static inline void arch_set_restart_data(struct restart_block *restart)
+{
+ struct thread_info *ti = current_thread_info();
+ if (ti->status & TS_COMPAT)
+ ti->status |= TS_COMPAT_RESTART;
+ else
+ ti->status &= ~TS_COMPAT_RESTART;
+}
#endif
-#ifndef __ASSEMBLY__
#ifdef CONFIG_X86_32
#define in_ia32_syscall() true
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 971830341061..82b0ff6cac97 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -711,7 +711,17 @@ extern struct movsl_mask {
* checking before using them, but you have to surround them with the
* user_access_begin/end() pair.
*/
-#define user_access_begin() __uaccess_begin()
+static __must_check inline bool user_access_begin(int type,
+ const void __user *ptr,
+ size_t len)
+{
+ if (unlikely(!access_ok(type, ptr, len)))
+ return 0;
+ __uaccess_begin_nospec();
+ return 1;
+}
+
+#define user_access_begin(a, b, c) user_access_begin(a, b, c)
#define user_access_end() __uaccess_end()
#define unsafe_put_user(x, ptr, err_label) \
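The surrounding pattern callers are expected to follow with the new three-argument form (illustrative caller, using this kernel's access_ok()/VERIFY_WRITE semantics):

	if (!user_access_begin(VERIFY_WRITE, uptr, sizeof(*uptr)))
		return -EFAULT;
	unsafe_put_user(val, uptr, Efault);
	user_access_end();
	return 0;
Efault:
	user_access_end();
	return -EFAULT;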
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6dda3595acf8..3613681b49cf 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1553,10 +1553,18 @@ void __init acpi_boot_table_init(void)
/*
* Initialize the ACPI boot-time table parser.
*/
- if (acpi_table_init()) {
+ if (acpi_locate_initial_tables())
disable_acpi();
- return;
- }
+ else
+ acpi_reserve_initial_tables();
+}
+
+int __init early_acpi_boot_init(void)
+{
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_init_complete();
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1569,18 +1577,9 @@ void __init acpi_boot_table_init(void)
} else {
printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
disable_acpi();
- return;
+ return 1;
}
}
-}
-
-int __init early_acpi_boot_init(void)
-{
- /*
- * If acpi_disabled, bail out
- */
- if (acpi_disabled)
- return 1;
/*
* Process the Multiple APIC Description Table (MADT), if present
@@ -1738,7 +1737,7 @@ int __acpi_acquire_global_lock(unsigned int *lock)
new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
val = cmpxchg(lock, old, new);
} while (unlikely (val != old));
- return (new < 3) ? -1 : 0;
+ return ((new & 0x3) < 3) ? -1 : 0;
}
int __acpi_release_global_lock(unsigned int *lock)
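
The masking change in __acpi_acquire_global_lock() above matters because only the low two bits of the FACS global-lock word carry the handshake state (bit 0 pending, bit 1 owned); the rest of the word is not part of it. As I read the algorithm, a nonzero return reports the lock as acquired, and comparing the unmasked value against 3 gives the wrong answer exactly when the lock is free but some upper bit happens to be set. A small worked sketch of the acquire step with the cmpxchg loop stripped out:

/*
 * Worked cases, under the bit layout assumed above:
 *   old = 0x00 (free)   -> new = 0x02, (new & 0x3) == 2 -> -1 (acquired)
 *   old = 0x02 (owned)  -> new = 0x03, (new & 0x3) == 3 ->  0 (pending set)
 *   old = 0x10 (free, with an unrelated upper bit set)
 *                       -> new = 0x12; the masked test still reports the lock
 *                          as acquired, while the old "new < 3" comparison
 *                          wrongly reported failure.
 */
static int sketch_acquire_step(unsigned int old)
{
	unsigned int new = ((old & ~0x3) + 2) + ((old >> 1) & 0x1);

	return ((new & 0x3) < 3) ? -1 : 0;
}
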
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index dde437f5d14f..596e7640d895 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -133,7 +133,8 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 /* Make sure we are running on the right CPU */
- retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx);
+ retval = call_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx,
+ false);
if (retval == 0) {
/* Use the hint in CST */
percpu_entry->states[cx->index].eax = cx->address;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6415b4aead54..76f2bbba92f9 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -42,6 +42,7 @@
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <linux/atomic.h>
+#include <asm/barrier.h>
#include <asm/mpspec.h>
#include <asm/i8259.h>
#include <asm/proto.h>
@@ -353,8 +354,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
* According to Intel, MFENCE can do the serialization here.
*/
asm volatile("mfence" : : : "memory");
-
- printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}
@@ -475,6 +474,9 @@ static int lapic_next_deadline(unsigned long delta,
{
u64 tsc;
+ /* This MSR is special and needs a special fence: */
+ weak_wrmsr_fence();
+
tsc = rdtsc();
wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
return 0;
@@ -553,7 +555,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
#define DEADLINE_MODEL_MATCH_REV(model, rev) \
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev }
-static u32 hsx_deadline_rev(void)
+static __init u32 hsx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x3a; /* EP */
@@ -563,7 +565,7 @@ static u32 hsx_deadline_rev(void)
return ~0U;
}
-static u32 bdx_deadline_rev(void)
+static __init u32 bdx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x00000011;
@@ -575,7 +577,7 @@ static u32 bdx_deadline_rev(void)
return ~0U;
}
-static u32 skx_deadline_rev(void)
+static __init u32 skx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x03: return 0x01000136;
@@ -588,7 +590,7 @@ static u32 skx_deadline_rev(void)
return ~0U;
}
-static const struct x86_cpu_id deadline_match[] = {
+static const struct x86_cpu_id deadline_match[] __initconst = {
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
@@ -610,18 +612,19 @@ static const struct x86_cpu_id deadline_match[] = {
{},
};
-static void apic_check_deadline_errata(void)
+static __init bool apic_validate_deadline_timer(void)
{
const struct x86_cpu_id *m;
u32 rev;
- if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
- boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
m = x86_match_cpu(deadline_match);
if (!m)
- return;
+ return true;
/*
* Function pointers will have the MSB set due to address layout,
@@ -633,11 +636,12 @@ static void apic_check_deadline_errata(void)
rev = (u32)m->driver_data;
if (boot_cpu_data.microcode >= rev)
- return;
+ return true;
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
"please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
}
/*
@@ -1914,7 +1918,8 @@ void __init init_apic_mappings(void)
{
unsigned int new_apicid;
- apic_check_deadline_errata();
+ if (apic_validate_deadline_timer())
+ pr_info("TSC deadline timer available\n");
if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2271adbc3c42..de74bca6a8ff 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1042,6 +1042,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
legacy = mp_is_legacy_irq(irq);
+ /*
+ * IRQ2 is unusable for historical reasons on systems which
+ * have a legacy PIC. See the comment vs. IRQ2 further down.
+ *
+ * If this gets removed at some point then the related code
+ * in lapic_assign_system_vectors() needs to be adjusted as
+ * well.
+ */
+ if (legacy && irq == PIC_CASCADE_IR)
+ return -EINVAL;
}
mutex_lock(&ioapic_mutex);
@@ -2160,6 +2170,7 @@ static inline void __init check_timer(void)
legacy_pic->init(0);
legacy_pic->make_irq(0);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ legacy_pic->unmask(0);
unlock_ExtINT_logic();
@@ -2233,12 +2244,12 @@ static int mp_irqdomain_create(int ioapic)
ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
(void *)(long)ioapic);
- /* Release fw handle if it was allocated above */
- if (!cfg->dev)
- irq_domain_free_fwnode(fn);
-
- if (!ip->irqdomain)
+ if (!ip->irqdomain) {
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
return -ENOMEM;
+ }
ip->irqdomain->parent = parent;
@@ -2252,8 +2263,13 @@ static int mp_irqdomain_create(int ioapic)
static void ioapic_destroy_irqdomain(int idx)
{
+ struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+ struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
+
if (ioapics[idx].irqdomain) {
irq_domain_remove(ioapics[idx].irqdomain);
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
ioapics[idx].irqdomain = NULL;
}
}
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index f10e7f93b0e2..8c102d62b859 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -149,10 +149,11 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
msi_default_domain =
pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
parent);
- irq_domain_free_fwnode(fn);
}
- if (!msi_default_domain)
+ if (!msi_default_domain) {
+ irq_domain_free_fwnode(fn);
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ }
}
#ifdef CONFIG_IRQ_REMAP
@@ -185,7 +186,8 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
if (!fn)
return NULL;
d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
- irq_domain_free_fwnode(fn);
+ if (!d)
+ irq_domain_free_fwnode(fn);
return d;
}
#endif
@@ -248,7 +250,8 @@ static struct irq_domain *dmar_get_irq_domain(void)
if (fn) {
dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
x86_vector_domain);
- irq_domain_free_fwnode(fn);
+ if (!dmar_domain)
+ irq_domain_free_fwnode(fn);
}
out:
mutex_unlock(&dmar_lock);
@@ -373,7 +376,10 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id)
}
d = msi_create_irq_domain(fn, domain_info, parent);
- irq_domain_free_fwnode(fn);
+ if (!d) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ }
return d;
}
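
The common thread in the io_apic.c and msi.c changes above is fwnode lifetime: the fwnode_handle has to stay alive as long as the irqdomain that carries it, so it is now freed only when domain creation fails, or when the domain itself is torn down. A hedged sketch of that pattern; my_msi_domain_info and the "SKETCH-MSI" name are invented, and the allocation call is not shown in these hunks, though the irqdomain APIs themselves are real.

static struct msi_domain_info my_msi_domain_info;

static struct irq_domain *sketch_create_domain(struct irq_domain *parent)
{
	struct fwnode_handle *fn = irq_domain_alloc_named_fwnode("SKETCH-MSI");
	struct irq_domain *d;

	if (!fn)
		return NULL;

	d = msi_create_irq_domain(fn, &my_msi_domain_info, parent);
	if (!d)
		irq_domain_free_fwnode(fn);	/* creation failed: drop it here */
	return d;				/* otherwise it lives with the domain */
}

static void sketch_destroy_domain(struct irq_domain *d)
{
	struct fwnode_handle *fn = d->fwnode;

	irq_domain_remove(d);
	irq_domain_free_fwnode(fn);		/* released at teardown, not earlier */
}
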
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b958082c74a7..637cf4dfccc9 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -368,6 +368,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irq_data->chip = &lapic_controller;
irq_data->chip_data = data;
irq_data->hwirq = virq + i;
+
+ /* Don't invoke affinity setter on deactivated interrupts */
+ irqd_set_affinity_on_activate(irq_data);
+
err = assign_irq_vector_policy(virq + i, node, data, info,
irq_data);
if (err) {
@@ -457,7 +461,6 @@ int __init arch_early_irq_init(void)
x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
NULL);
BUG_ON(x86_vector_domain == NULL);
- irq_domain_free_fwnode(fn);
irq_set_default_host(x86_vector_domain);
arch_init_msi_domain(x86_vector_domain);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e216cf3d64d2..32c5dba6f1bb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
}
@@ -42,7 +43,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
unsigned long flags;
u32 dest;
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index b94d35320f85..98716a4be0a7 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -41,7 +41,8 @@ static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
}
@@ -52,7 +53,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
unsigned long this_cpu;
unsigned long flags;
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
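
The x2apic senders above now call weak_wrmsr_fence() instead of x2apic_wrmsr_fence(); the point is that WRMSR to the x2APIC and TSC-deadline MSRs is not architecturally serializing, so stores and loads have to be fenced explicitly before the write. The <asm/barrier.h> side of the change is outside this section; a plausible shape for it, consistent with the MFENCE comment in __setup_APIC_LVTT() earlier, would be:

/* Assumed definition, treat as a sketch rather than the actual hunk. */
static inline void weak_wrmsr_fence(void)
{
	asm volatile("mfence; lfence" : : : "memory");
}
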
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7896a34f53b5..fcea3aa19c23 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
static void __init mds_print_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init srbds_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -60,7 +61,7 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
u64 __ro_after_init x86_amd_ls_cfg_base;
u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
-/* Control conditional STIPB in switch_to() */
+/* Control conditional STIBP in switch_to() */
DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
/* Control conditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
@@ -108,6 +109,7 @@ void __init check_bugs(void)
l1tf_select_mitigation();
mds_select_mitigation();
taa_select_mitigation();
+ srbds_select_mitigation();
/*
* As MDS and TAA mitigations are inter-related, print MDS
@@ -391,6 +393,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str)
early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "SRBDS: " fmt
+
+enum srbds_mitigations {
+ SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_UCODE_NEEDED,
+ SRBDS_MITIGATION_FULL,
+ SRBDS_MITIGATION_TSX_OFF,
+ SRBDS_MITIGATION_HYPERVISOR,
+};
+
+static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+
+static const char * const srbds_strings[] = {
+ [SRBDS_MITIGATION_OFF] = "Vulnerable",
+ [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled",
+ [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+static bool srbds_off;
+
+void update_srbds_msr(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ switch (srbds_mitigation) {
+ case SRBDS_MITIGATION_OFF:
+ case SRBDS_MITIGATION_TSX_OFF:
+ mcu_ctrl |= RNGDS_MITG_DIS;
+ break;
+ case SRBDS_MITIGATION_FULL:
+ mcu_ctrl &= ~RNGDS_MITG_DIS;
+ break;
+ default:
+ break;
+ }
+
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+}
+
+static void __init srbds_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ /*
+ * Check to see if this is one of the MDS_NO systems supporting
+ * TSX that are only exposed to SRBDS when TSX is enabled.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM))
+ srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
+ else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
+ else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
+ else if (cpu_mitigations_off() || srbds_off)
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+
+ update_srbds_msr();
+ pr_info("%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static int __init srbds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return 0;
+
+ srbds_off = !strcmp(str, "off");
+ return 0;
+}
+early_param("srbds", srbds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -488,7 +581,9 @@ early_param("nospectre_v1", nospectre_v1_cmdline);
static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
SPECTRE_V2_NONE;
-static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
+static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
SPECTRE_V2_USER_NONE;
#ifdef CONFIG_RETPOLINE
@@ -540,10 +635,11 @@ enum spectre_v2_user_cmd {
};
static const char * const spectre_v2_user_strings[] = {
- [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
- [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
- [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
- [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
+ [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection",
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
};
static const struct {
@@ -637,11 +733,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ spectre_v2_user_ibpb = mode;
switch (cmd) {
case SPECTRE_V2_USER_CMD_FORCE:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
static_branch_enable(&switch_mm_always_ibpb);
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_AUTO:
@@ -657,21 +755,32 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
"always-on" : "conditional");
}
- /* If enhanced IBRS is enabled no STIPB required */
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ /*
+ * If enhanced IBRS is enabled or SMT impossible, STIBP is not
+ * required.
+ */
+ if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return;
/*
- * If SMT is not possible or STIBP is not available clear the STIPB
- * mode.
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
*/
- if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
+ if (mode != SPECTRE_V2_USER_STRICT &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+ /*
+ * If STIBP is not available, clear the STIBP mode.
+ */
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
mode = SPECTRE_V2_USER_NONE;
+
+ spectre_v2_user_stibp = mode;
+
set_mode:
- spectre_v2_user = mode;
- /* Only print the STIBP mode when SMT possible */
- if (smt_possible)
- pr_info("%s\n", spectre_v2_user_strings[mode]);
+ pr_info("%s\n", spectre_v2_user_strings[mode]);
}
static const char * const spectre_v2_strings[] = {
@@ -902,10 +1011,11 @@ void arch_smt_update(void)
{
mutex_lock(&spec_ctrl_mutex);
- switch (spectre_v2_user) {
+ switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
break;
case SPECTRE_V2_USER_STRICT:
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
update_stibp_strict();
break;
case SPECTRE_V2_USER_PRCTL:
@@ -1130,18 +1240,41 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
return 0;
}
+static bool is_spec_ib_user_controlled(void)
+{
+ return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP;
+}
+
static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
case PR_SPEC_ENABLE:
- if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return 0;
+
/*
- * Indirect branch speculation is always disabled in strict
- * mode.
+ * With strict mode for both IBPB and STIBP, the instruction
+ * code paths avoid checking this task flag and instead,
+ * unconditionally run the instruction. However, STIBP and IBPB
+ * are independent and either can be set to conditionally
+ * enabled regardless of the mode of the other.
+ *
+ * If either is set to conditional, allow the task flag to be
+ * updated, unless it was force-disabled by a previous prctl
+ * call. Currently, this is possible on an AMD CPU which has the
+ * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the
+ * kernel is booted with 'spectre_v2_user=seccomp', then
+ * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and
+ * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.
*/
- if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+ if (!is_spec_ib_user_controlled() ||
+ task_spec_ib_force_disable(task))
return -EPERM;
+
task_clear_spec_ib_disable(task);
task_update_spec_tif(task);
break;
@@ -1151,10 +1284,13 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
* Indirect branch speculation is always allowed when
* mitigation is force disabled.
*/
- if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return -EPERM;
- if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+
+ if (!is_spec_ib_user_controlled())
return 0;
+
task_set_spec_ib_disable(task);
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
@@ -1184,7 +1320,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
{
if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
- if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP)
ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
}
#endif
@@ -1213,21 +1350,21 @@ static int ib_prctl_get(struct task_struct *task)
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
return PR_SPEC_NOT_AFFECTED;
- switch (spectre_v2_user) {
- case SPECTRE_V2_USER_NONE:
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return PR_SPEC_ENABLE;
- case SPECTRE_V2_USER_PRCTL:
- case SPECTRE_V2_USER_SECCOMP:
+ else if (is_spec_ib_user_controlled()) {
if (task_spec_ib_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ib_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- case SPECTRE_V2_USER_STRICT:
+ } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
return PR_SPEC_DISABLE;
- default:
+ else
return PR_SPEC_NOT_AFFECTED;
- }
}
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
@@ -1466,11 +1603,13 @@ static char *stibp_state(void)
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return "";
- switch (spectre_v2_user) {
+ switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
return ", STIBP: disabled";
case SPECTRE_V2_USER_STRICT:
return ", STIBP: forced";
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ return ", STIBP: always-on";
case SPECTRE_V2_USER_PRCTL:
case SPECTRE_V2_USER_SECCOMP:
if (static_key_enabled(&switch_to_cond_stibp))
@@ -1491,6 +1630,11 @@ static char *ibpb_state(void)
return "";
}
+static ssize_t srbds_show_state(char *buf)
+{
+ return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -1532,6 +1676,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_ITLB_MULTIHIT:
return itlb_multihit_show_state(buf);
+ case X86_BUG_SRBDS:
+ return srbds_show_state(buf);
+
default:
break;
}
@@ -1578,4 +1725,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr
{
return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
}
+
+ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
+}
#endif
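
With the spectre_v2_user state split into separate IBPB and STIBP tracking, the per-task control surface stays the PR_SPEC_INDIRECT_BRANCH prctl, plus the sysfs vulnerability files (now including srbds). A small userspace sketch of querying the state and force-disabling indirect-branch speculation for the current task; error handling is intentionally minimal.

/* Userspace illustration; needs <linux/prctl.h> for the PR_SPEC_* constants. */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);

	if (state < 0) {
		perror("PR_GET_SPECULATION_CTRL");
		return 1;
	}
	printf("indirect branch speculation state: 0x%x\n", state);

	/* Only permitted when the kernel reports the prctl/seccomp (conditional) mode. */
	if ((state & PR_SPEC_PRCTL) &&
	    prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
		  PR_SPEC_FORCE_DISABLE, 0, 0))
		perror("PR_SPEC_FORCE_DISABLE");

	return 0;
}
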
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7b4141889919..64066a2497e4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -854,6 +854,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
#endif
+ c->x86_cache_bits = c->x86_phys_bits;
if (c->extended_cpuid_level >= 0x8000000a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
@@ -894,7 +895,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
}
}
#endif
- c->x86_cache_bits = c->x86_phys_bits;
}
#define NO_SPECULATION BIT(0)
@@ -964,9 +964,30 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
{}
};
-static bool __init cpu_matches(unsigned long which)
+#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
+ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, steppings, \
+ X86_FEATURE_ANY, issues)
+
+#define SRBDS BIT(0)
+
+static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
+ VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_CORE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_ULT, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_GT3E, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_GT3E, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_CORE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPINGS(0x0, 0xC), SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPINGS(0x0, 0xD), SRBDS),
+ {}
+};
+
+static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which)
{
- const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
+ const struct x86_cpu_id *m = x86_match_cpu(table);
return m && !!(m->driver_data & which);
}
@@ -986,29 +1007,32 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
u64 ia32_cap = x86_read_arch_cap_msr();
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
- if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
+ !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
- if (cpu_matches(NO_SPECULATION))
+ if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
- if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
+ !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
if (ia32_cap & ARCH_CAP_IBRS_ALL)
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
- if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
+ !(ia32_cap & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
- if (cpu_matches(MSBDS_ONLY))
+ if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
}
- if (!cpu_matches(NO_SWAPGS))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS))
setup_force_cpu_bug(X86_BUG_SWAPGS);
/*
@@ -1026,7 +1050,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
setup_force_cpu_bug(X86_BUG_TAA);
- if (cpu_matches(NO_MELTDOWN))
+ /*
+ * SRBDS affects CPUs which support RDRAND or RDSEED and are listed
+ * in the vulnerability blacklist.
+ */
+ if ((cpu_has(c, X86_FEATURE_RDRAND) ||
+ cpu_has(c, X86_FEATURE_RDSEED)) &&
+ cpu_matches(cpu_vuln_blacklist, SRBDS))
+ setup_force_cpu_bug(X86_BUG_SRBDS);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
@@ -1035,7 +1068,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
- if (cpu_matches(NO_L1TF))
+ if (cpu_matches(cpu_vuln_whitelist, NO_L1TF))
return;
setup_force_cpu_bug(X86_BUG_L1TF);
@@ -1451,6 +1484,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
mtrr_ap_init();
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
+ update_srbds_msr();
}
static __init int setup_noclflush(char *arg)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index db10a63687d3..432058e5e44b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -69,6 +69,7 @@ extern int detect_ht_early(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
+extern void update_srbds_msr(void);
extern u64 x86_read_arch_cap_msr(void);
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 3f731d7f04bf..07742b69d914 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -135,6 +135,40 @@ struct rdt_resource rdt_resources_all[] = {
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_L2DATA] =
+ {
+ .rid = RDT_RESOURCE_L2DATA,
+ .name = "L2DATA",
+ .domains = domain_init(RDT_RESOURCE_L2DATA),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L2CODE] =
+ {
+ .rid = RDT_RESOURCE_L2CODE,
+ .name = "L2CODE",
+ .domains = domain_init(RDT_RESOURCE_L2CODE),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 1,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
[RDT_RESOURCE_MBA] =
{
.rid = RDT_RESOURCE_MBA,
@@ -259,15 +293,15 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
r->alloc_enabled = true;
}
-static void rdt_get_cdp_l3_config(int type)
+static void rdt_get_cdp_config(int level, int type)
{
- struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r_l = &rdt_resources_all[level];
struct rdt_resource *r = &rdt_resources_all[type];
- r->num_closid = r_l3->num_closid / 2;
- r->cache.cbm_len = r_l3->cache.cbm_len;
- r->default_ctrl = r_l3->default_ctrl;
- r->cache.shareable_bits = r_l3->cache.shareable_bits;
+ r->num_closid = r_l->num_closid / 2;
+ r->cache.cbm_len = r_l->cache.cbm_len;
+ r->default_ctrl = r_l->default_ctrl;
+ r->cache.shareable_bits = r_l->cache.shareable_bits;
r->data_width = (r->cache.cbm_len + 3) / 4;
r->alloc_capable = true;
/*
@@ -277,6 +311,18 @@ static void rdt_get_cdp_l3_config(int type)
r->alloc_enabled = false;
}
+static void rdt_get_cdp_l3_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA);
+ rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE);
+}
+
+static void rdt_get_cdp_l2_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA);
+ rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE);
+}
+
static int get_cache_id(int cpu, int level)
{
struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
@@ -486,6 +532,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
d->id = id;
cpumask_set_cpu(cpu, &d->cpu_mask);
+ rdt_domain_reconfigure_cdp(r);
+
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
kfree(d);
return;
@@ -729,15 +777,15 @@ static __init bool get_rdt_alloc_resources(void)
if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
- if (rdt_cpu_has(X86_FEATURE_CDP_L3)) {
- rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
- rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
- }
+ if (rdt_cpu_has(X86_FEATURE_CDP_L3))
+ rdt_get_cdp_l3_config();
ret = true;
}
if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
 /* CPUID 0x10.2 fields are the same format as 0x10.1 */
rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L2))
+ rdt_get_cdp_l2_config();
ret = true;
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index a43a72d8e88e..b43a786ec15f 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -7,12 +7,15 @@
#include <linux/jump_label.h>
#define IA32_L3_QOS_CFG 0xc81
+#define IA32_L2_QOS_CFG 0xc82
#define IA32_L3_CBM_BASE 0xc90
#define IA32_L2_CBM_BASE 0xd10
#define IA32_MBA_THRTL_BASE 0xd50
#define L3_QOS_CDP_ENABLE 0x01ULL
+#define L2_QOS_CDP_ENABLE 0x01ULL
+
/*
* Event IDs are used to program IA32_QM_EVTSEL before reading event
* counter from IA32_QM_CTR
@@ -354,6 +357,8 @@ enum {
RDT_RESOURCE_L3DATA,
RDT_RESOURCE_L3CODE,
RDT_RESOURCE_L2,
+ RDT_RESOURCE_L2DATA,
+ RDT_RESOURCE_L2CODE,
RDT_RESOURCE_MBA,
/* Must be the last */
@@ -437,5 +442,6 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
void cqm_handle_limbo(struct work_struct *work);
bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 30827510094b..1c4be5dcd474 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -225,15 +225,14 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
-static int __mon_event_count(u32 rmid, struct rmid_read *rr)
+static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
{
u64 chunks, shift, tval;
struct mbm_state *m;
tval = __rmid_read(rmid, rr->evtid);
if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
- rr->val = tval;
- return -EINVAL;
+ return tval;
}
switch (rr->evtid) {
case QOS_L3_OCCUP_EVENT_ID:
@@ -247,10 +246,10 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
break;
default:
/*
- * Code would never reach here because
- * an invalid event id would fail the __rmid_read.
+ * Code would never reach here because an invalid
+ * event id would fail the __rmid_read.
*/
- return -EINVAL;
+ return RMID_VAL_ERROR;
}
if (rr->first) {
@@ -278,23 +277,29 @@ void mon_event_count(void *info)
struct rdtgroup *rdtgrp, *entry;
struct rmid_read *rr = info;
struct list_head *head;
+ u64 ret_val;
rdtgrp = rr->rgrp;
- if (__mon_event_count(rdtgrp->mon.rmid, rr))
- return;
+ ret_val = __mon_event_count(rdtgrp->mon.rmid, rr);
/*
- * For Ctrl groups read data from child monitor groups.
+ * For Ctrl groups read data from child monitor groups and
+ * add them together. Count events which are read successfully.
+ * Discard the rmid_reads that report errors.
*/
head = &rdtgrp->mon.crdtgrp_list;
if (rdtgrp->type == RDTCTRL_GROUP) {
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->mon.rmid, rr))
- return;
+ if (__mon_event_count(entry->mon.rmid, rr) == 0)
+ ret_val = 0;
}
}
+
+ /* Report error if none of rmid_reads are successful */
+ if (ret_val)
+ rr->val = ret_val;
}
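
With __mon_event_count() returning the raw counter value (or the RMID_VAL_* error bits) instead of -EINVAL, the consuming side distinguishes the cases by testing those bits in rr->val. A hedged sketch of how a seq_file reader could report it; the function name and the mon_scale parameter are illustrative, not part of this hunk.

static void sketch_report_rmid_read(struct seq_file *m, struct rmid_read *rr,
				    unsigned int mon_scale)
{
	if (rr->val & RMID_VAL_ERROR)
		seq_puts(m, "Error\n");
	else if (rr->val & RMID_VAL_UNAVAIL)
		seq_puts(m, "Unavailable\n");
	else
		seq_printf(m, "%llu\n", rr->val * mon_scale);
}
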
static void mbm_update(struct rdt_domain *d, int rmid)
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 0ec30b2384c0..10d8f5cbb510 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -393,82 +393,86 @@ unlock:
return ret ?: nbytes;
}
-struct task_move_callback {
- struct callback_head work;
- struct rdtgroup *rdtgrp;
-};
-
-static void move_myself(struct callback_head *head)
+/**
+ * rdtgroup_remove - the helper to remove a resource group safely
+ * @rdtgrp: resource group to remove
+ *
+ * On resource group creation via a mkdir, an extra kernfs_node reference is
+ * taken to ensure that the rdtgroup structure remains accessible for the
+ * rdtgroup_kn_unlock() calls where it is removed.
+ *
+ * Drop the extra reference here, then free the rdtgroup structure.
+ *
+ * Return: void
+ */
+static void rdtgroup_remove(struct rdtgroup *rdtgrp)
{
- struct task_move_callback *callback;
- struct rdtgroup *rdtgrp;
-
- callback = container_of(head, struct task_move_callback, work);
- rdtgrp = callback->rdtgrp;
+ kernfs_put(rdtgrp->kn);
+ kfree(rdtgrp);
+}
+static void _update_task_closid_rmid(void *task)
+{
/*
- * If resource group was deleted before this task work callback
- * was invoked, then assign the task to root group and free the
- * resource group.
+ * If the task is still current on this CPU, update PQR_ASSOC MSR.
+ * Otherwise, the MSR is updated when the task is scheduled in.
*/
- if (atomic_dec_and_test(&rdtgrp->waitcount) &&
- (rdtgrp->flags & RDT_DELETED)) {
- current->closid = 0;
- current->rmid = 0;
- kfree(rdtgrp);
- }
-
- preempt_disable();
- /* update PQR_ASSOC MSR to make resource group go into effect */
- intel_rdt_sched_in();
- preempt_enable();
+ if (task == current)
+ intel_rdt_sched_in();
+}
- kfree(callback);
+static void update_task_closid_rmid(struct task_struct *t)
+{
+ if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
+ smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
+ else
+ _update_task_closid_rmid(t);
}
static int __rdtgroup_move_task(struct task_struct *tsk,
struct rdtgroup *rdtgrp)
{
- struct task_move_callback *callback;
- int ret;
-
- callback = kzalloc(sizeof(*callback), GFP_KERNEL);
- if (!callback)
- return -ENOMEM;
- callback->work.func = move_myself;
- callback->rdtgrp = rdtgrp;
+ /* If the task is already in rdtgrp, no need to move the task. */
+ if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
+ tsk->rmid == rdtgrp->mon.rmid) ||
+ (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
+ tsk->closid == rdtgrp->mon.parent->closid))
+ return 0;
/*
- * Take a refcount, so rdtgrp cannot be freed before the
- * callback has been invoked.
+ * Set the task's closid/rmid before the PQR_ASSOC MSR can be
+ * updated by them.
+ *
+ * For ctrl_mon groups, move both closid and rmid.
+ * For monitor groups, can move the tasks only from
+ * their parent CTRL group.
*/
- atomic_inc(&rdtgrp->waitcount);
- ret = task_work_add(tsk, &callback->work, true);
- if (ret) {
- /*
- * Task is exiting. Drop the refcount and free the callback.
- * No need to check the refcount as the group cannot be
- * deleted before the write function unlocks rdtgroup_mutex.
- */
- atomic_dec(&rdtgrp->waitcount);
- kfree(callback);
- } else {
- /*
- * For ctrl_mon groups move both closid and rmid.
- * For monitor groups, can move the tasks only from
- * their parent CTRL group.
- */
- if (rdtgrp->type == RDTCTRL_GROUP) {
- tsk->closid = rdtgrp->closid;
+
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ tsk->closid = rdtgrp->closid;
+ tsk->rmid = rdtgrp->mon.rmid;
+ } else if (rdtgrp->type == RDTMON_GROUP) {
+ if (rdtgrp->mon.parent->closid == tsk->closid)
tsk->rmid = rdtgrp->mon.rmid;
- } else if (rdtgrp->type == RDTMON_GROUP) {
- if (rdtgrp->mon.parent->closid == tsk->closid)
- tsk->rmid = rdtgrp->mon.rmid;
- else
- ret = -EINVAL;
- }
+ else
+ return -EINVAL;
}
- return ret;
+
+ /*
+ * Ensure the task's closid and rmid are written before determining if
+ * the task is current, which decides whether it will be interrupted.
+ */
+ barrier();
+
+ /*
+ * By now, the task's closid and rmid are set. If the task is current
+ * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
+ * group go into effect. If the task is not current, the MSR will be
+ * updated when the task is scheduled in.
+ */
+ update_task_closid_rmid(tsk);
+
+ return 0;
}
static int rdtgroup_task_write_permission(struct task_struct *task,
@@ -830,7 +834,6 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
if (IS_ERR(kn_subdir))
return PTR_ERR(kn_subdir);
- kernfs_get(kn_subdir);
ret = rdtgroup_kn_set_ugid(kn_subdir);
if (ret)
return ret;
@@ -853,7 +856,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
if (IS_ERR(kn_info))
return PTR_ERR(kn_info);
- kernfs_get(kn_info);
for_each_alloc_enabled_rdt_resource(r) {
fflags = r->fflags | RF_CTRL_INFO;
@@ -870,12 +872,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
goto out_destroy;
}
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that @rdtgrp->kn is always accessible.
- */
- kernfs_get(kn_info);
-
ret = rdtgroup_kn_set_ugid(kn_info);
if (ret)
goto out_destroy;
@@ -904,12 +900,6 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
if (dest_kn)
*dest_kn = kn;
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that @rdtgrp->kn is always accessible.
- */
- kernfs_get(kn);
-
ret = rdtgroup_kn_set_ugid(kn);
if (ret)
goto out_destroy;
@@ -922,6 +912,7 @@ out_destroy:
kernfs_remove(kn);
return ret;
}
+
static void l3_qos_cfg_update(void *arg)
{
bool *enable = arg;
@@ -929,8 +920,17 @@ static void l3_qos_cfg_update(void *arg)
wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
}
-static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
+static void l2_qos_cfg_update(void *arg)
{
+ bool *enable = arg;
+
+ wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_cache_qos_cfg(int level, bool enable)
+{
+ void (*update)(void *arg);
+ struct rdt_resource *r_l;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
int cpu;
@@ -938,16 +938,24 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
- list_for_each_entry(d, &r->domains, list) {
+ if (level == RDT_RESOURCE_L3)
+ update = l3_qos_cfg_update;
+ else if (level == RDT_RESOURCE_L2)
+ update = l2_qos_cfg_update;
+ else
+ return -EINVAL;
+
+ r_l = &rdt_resources_all[level];
+ list_for_each_entry(d, &r_l->domains, list) {
/* Pick one CPU from each domain instance to update MSR */
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
}
cpu = get_cpu();
/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
if (cpumask_test_cpu(cpu, cpu_mask))
- l3_qos_cfg_update(&enable);
+ update(&enable);
/* Update QOS_CFG MSR on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1);
+ smp_call_function_many(cpu_mask, update, &enable, 1);
put_cpu();
free_cpumask_var(cpu_mask);
@@ -955,52 +963,99 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
return 0;
}
-static int cdp_enable(void)
+static int cdp_enable(int level, int data_type, int code_type)
{
- struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
- struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE];
- struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
+ struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
+ struct rdt_resource *r_l = &rdt_resources_all[level];
int ret;
- if (!r_l3->alloc_capable || !r_l3data->alloc_capable ||
- !r_l3code->alloc_capable)
+ if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
+ !r_lcode->alloc_capable)
return -EINVAL;
- ret = set_l3_qos_cfg(r_l3, true);
+ ret = set_cache_qos_cfg(level, true);
if (!ret) {
- r_l3->alloc_enabled = false;
- r_l3data->alloc_enabled = true;
- r_l3code->alloc_enabled = true;
+ r_l->alloc_enabled = false;
+ r_ldata->alloc_enabled = true;
+ r_lcode->alloc_enabled = true;
}
return ret;
}
-static void cdp_disable(void)
+static int cdpl3_enable(void)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE);
+}
+
+static int cdpl2_enable(void)
+{
+ return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
+ RDT_RESOURCE_L2CODE);
+}
+
+static void cdp_disable(int level, int data_type, int code_type)
+{
+ struct rdt_resource *r = &rdt_resources_all[level];
r->alloc_enabled = r->alloc_capable;
- if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) {
- rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false;
- rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false;
- set_l3_qos_cfg(r, false);
+ if (rdt_resources_all[data_type].alloc_enabled) {
+ rdt_resources_all[data_type].alloc_enabled = false;
+ rdt_resources_all[code_type].alloc_enabled = false;
+ set_cache_qos_cfg(level, false);
}
}
+static void cdpl3_disable(void)
+{
+ cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
+}
+
+static void cdpl2_disable(void)
+{
+ cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
+}
+
+static void cdp_disable_all(void)
+{
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
+ cdpl3_disable();
+ if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
+ cdpl2_disable();
+}
+
static int parse_rdtgroupfs_options(char *data)
{
char *token, *o = data;
int ret = 0;
while ((token = strsep(&o, ",")) != NULL) {
- if (!*token)
- return -EINVAL;
+ if (!*token) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (!strcmp(token, "cdp"))
- ret = cdp_enable();
+ if (!strcmp(token, "cdp")) {
+ ret = cdpl3_enable();
+ if (ret)
+ goto out;
+ } else if (!strcmp(token, "cdpl2")) {
+ ret = cdpl2_enable();
+ if (ret)
+ goto out;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
}
+ return 0;
+
+out:
+ pr_err("Invalid mount option \"%s\"\n", token);
+
return ret;
}
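
The new "cdpl2" token enables L2 code/data prioritization the same way "cdp" already does for L3, as a resctrl mount option. A minimal illustration using mount(2) directly; the mount point is the conventional one and is only an example.

/* Userspace illustration, equivalent to: mount -t resctrl -o cdp,cdpl2 resctrl /sys/fs/resctrl */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, "cdp,cdpl2")) {
		perror("mount resctrl");
		return 1;
	}
	return 0;
}
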
@@ -1061,8 +1116,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
if (atomic_dec_and_test(&rdtgrp->waitcount) &&
(rdtgrp->flags & RDT_DELETED)) {
kernfs_unbreak_active_protection(kn);
- kernfs_put(rdtgrp->kn);
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
} else {
kernfs_unbreak_active_protection(kn);
}
@@ -1113,7 +1167,6 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
dentry = ERR_PTR(ret);
goto out_info;
}
- kernfs_get(kn_mongrp);
ret = mkdir_mondata_all(rdtgroup_default.kn,
&rdtgroup_default, &kn_mondata);
@@ -1121,7 +1174,6 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
dentry = ERR_PTR(ret);
goto out_mongrp;
}
- kernfs_get(kn_mondata);
rdtgroup_default.mon.mon_data_kn = kn_mondata;
}
@@ -1155,7 +1207,7 @@ out_mongrp:
out_info:
kernfs_remove(kn_info);
out_cdp:
- cdp_disable();
+ cdp_disable_all();
out:
mutex_unlock(&rdtgroup_mutex);
cpus_read_unlock();
@@ -1264,7 +1316,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
if (atomic_read(&sentry->waitcount) != 0)
sentry->flags = RDT_DELETED;
else
- kfree(sentry);
+ rdtgroup_remove(sentry);
}
}
@@ -1302,7 +1354,7 @@ static void rmdir_all_sub(void)
if (atomic_read(&rdtgrp->waitcount) != 0)
rdtgrp->flags = RDT_DELETED;
else
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
update_closid_rmid(cpu_online_mask, &rdtgroup_default);
@@ -1322,7 +1374,7 @@ static void rdt_kill_sb(struct super_block *sb)
/*Put everything back to default values. */
for_each_alloc_enabled_rdt_resource(r)
reset_all_ctrls(r);
- cdp_disable();
+ cdp_disable_all();
rmdir_all_sub();
static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
static_branch_disable_cpuslocked(&rdt_mon_enable_key);
@@ -1396,11 +1448,6 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
if (IS_ERR(kn))
return PTR_ERR(kn);
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that kn is always accessible.
- */
- kernfs_get(kn);
ret = rdtgroup_kn_set_ugid(kn);
if (ret)
goto out_destroy;
@@ -1561,8 +1608,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
/*
* kernfs_remove() will drop the reference count on "kn" which
* will free it. But we still need it to stick around for the
- * rdtgroup_kn_unlock(kn} call below. Take one extra reference
- * here, which will be dropped inside rdtgroup_kn_unlock().
+ * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
+ * which will be dropped by kernfs_put() in rdtgroup_remove().
*/
kernfs_get(kn);
@@ -1596,6 +1643,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
out_idfree:
free_rmid(rdtgrp->mon.rmid);
out_destroy:
+ kernfs_put(rdtgrp->kn);
kernfs_remove(rdtgrp->kn);
out_free_rgrp:
kfree(rdtgrp);
@@ -1608,7 +1656,7 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
{
kernfs_remove(rgrp->kn);
free_rmid(rgrp->mon.rmid);
- kfree(rgrp);
+ rdtgroup_remove(rgrp);
}
/*
@@ -1692,6 +1740,19 @@ out_unlock:
return ret;
}
+/* Restore the qos cfg state when a domain comes online */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
+{
+ if (!r->alloc_capable)
+ return;
+
+ if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA])
+ l2_qos_cfg_update(&r->alloc_enabled);
+
+ if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA])
+ l3_qos_cfg_update(&r->alloc_enabled);
+}
+
/*
* We allow creating mon groups only within a directory called "mon_groups"
* which is present in every ctrl_mon group. Check if this is a valid
@@ -1761,11 +1822,6 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
list_del(&rdtgrp->mon.crdtgrp_list);
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
return 0;
@@ -1802,11 +1858,6 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
list_del(&rdtgrp->rdtgroup_list);
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
/*
@@ -1840,7 +1891,8 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
* If the rdtgroup is a mon group and parent directory
* is a valid "mon_groups" directory, remove the mon group.
*/
- if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
+ if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
+ rdtgrp != &rdtgroup_default)
ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
else if (rdtgrp->type == RDTMON_GROUP &&
is_mon_groups(parent_kn, kn->name))
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 3fed38812eea..751e59057466 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -34,13 +34,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
- for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
+ for (m = match;
+ m->vendor | m->family | m->model | m->steppings | m->feature;
+ m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
continue;
if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
continue;
+ if (m->steppings != X86_STEPPING_ANY &&
+ !(BIT(c->x86_stepping) & m->steppings))
+ continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
return m;
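
x86_match_cpu() now also walks the steppings field, treating it as a bitmask of acceptable steppings with X86_STEPPING_ANY meaning "don't care", and the all-zero terminator check includes it. A hedged sketch of building such a table with the macro used in the cpu/common.c hunk above; the model and the SKETCH_ISSUE flag are placeholders, not a real blacklist entry.

#define SKETCH_ISSUE	BIT(0)

static const struct x86_cpu_id sketch_blacklist[] __initconst = {
	X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6,
		INTEL_FAM6_KABYLAKE_MOBILE, X86_STEPPINGS(0x0, 0xC),
		X86_FEATURE_ANY, SKETCH_ISSUE),
	{}	/* all-zero terminator, steppings included */
};

static bool __init sketch_cpu_affected(void)
{
	const struct x86_cpu_id *m = x86_match_cpu(sketch_blacklist);

	return m && (m->driver_data & SKETCH_ISSUE);
}
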
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e57b59762f9f..94aa91b09c28 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -518,7 +518,7 @@ static void do_inject(void)
*/
if (inj_type == DFR_INT_INJ) {
i_mce.status |= MCI_STATUS_DEFERRED;
- i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
+ i_mce.status &= ~MCI_STATUS_UC;
}
/*
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 93c22e7ee424..51583a5d656d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -627,16 +627,16 @@ static ssize_t reload_store(struct device *dev,
if (val != 1)
return size;
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
- if (tmp_ret != UCODE_NEW)
- return size;
-
get_online_cpus();
ret = check_online_cpus();
if (ret)
goto put;
+ tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+ if (tmp_ret != UCODE_NEW)
+ goto put;
+
mutex_lock(&microcode_mutex);
ret = microcode_reload_late();
mutex_unlock(&microcode_mutex);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 16936a24795c..3aa0e5a45303 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -103,53 +103,6 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev
return find_matching_signature(mc, csig, cpf);
}
-/*
- * Given CPU signature and a microcode patch, this function finds if the
- * microcode patch has matching family and model with the CPU.
- *
- * %true - if there's a match
- * %false - otherwise
- */
-static bool microcode_matches(struct microcode_header_intel *mc_header,
- unsigned long sig)
-{
- unsigned long total_size = get_totalsize(mc_header);
- unsigned long data_size = get_datasize(mc_header);
- struct extended_sigtable *ext_header;
- unsigned int fam_ucode, model_ucode;
- struct extended_signature *ext_sig;
- unsigned int fam, model;
- int ext_sigcount, i;
-
- fam = x86_family(sig);
- model = x86_model(sig);
-
- fam_ucode = x86_family(mc_header->sig);
- model_ucode = x86_model(mc_header->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return true;
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- return false;
-
- ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
- ext_sigcount = ext_header->count;
-
- for (i = 0; i < ext_sigcount; i++) {
- fam_ucode = x86_family(ext_sig->sig);
- model_ucode = x86_model(ext_sig->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return true;
-
- ext_sig++;
- }
- return false;
-}
-
static struct ucode_patch *memdup_patch(void *data, unsigned int size)
{
struct ucode_patch *p;
@@ -167,7 +120,7 @@ static struct ucode_patch *memdup_patch(void *data, unsigned int size)
return p;
}
-static void save_microcode_patch(void *data, unsigned int size)
+static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
{
struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
struct ucode_patch *iter, *tmp, *p = NULL;
@@ -213,6 +166,9 @@ static void save_microcode_patch(void *data, unsigned int size)
if (!p)
return;
+ if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
+ return;
+
/*
* Save for early loading. On 32-bit, that needs to be a physical
* address as the APs are running from physical addresses, before
@@ -347,13 +303,14 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
size -= mc_size;
- if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+ if (!find_matching_signature(data, uci->cpu_sig.sig,
+ uci->cpu_sig.pf)) {
data += mc_size;
continue;
}
if (save) {
- save_microcode_patch(data, mc_size);
+ save_microcode_patch(uci, data, mc_size);
goto next;
}
@@ -486,14 +443,14 @@ static void show_saved_mc(void)
* Save this microcode patch. It will be loaded early when a CPU is
* hot-added or resumes.
*/
-static void save_mc_for_early(u8 *mc, unsigned int size)
+static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size)
{
/* Synchronization during CPU hotplug. */
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
mutex_lock(&x86_cpu_microcode_mutex);
- save_microcode_patch(mc, size);
+ save_microcode_patch(uci, mc, size);
show_saved_mc();
mutex_unlock(&x86_cpu_microcode_mutex);
@@ -937,7 +894,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
- save_mc_for_early(new_mc, new_mc_size);
+ save_mc_for_early(uci, new_mc, new_mc_size);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c0201b11e9e2..a6b323a3a630 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -178,8 +178,8 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
- pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints);
+ pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
+ ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS);
ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index e12ee86906c6..9436f3452049 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -166,9 +166,6 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
*repeat = 0;
*uniform = 1;
- /* Make end inclusive instead of exclusive */
- end--;
-
prev_match = MTRR_TYPE_INVALID;
for (i = 0; i < num_var_ranges; ++i) {
unsigned short start_state, end_state, inclusive;
@@ -260,6 +257,9 @@ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
int repeat;
u64 partial_end;
+ /* Make end inclusive instead of exclusive */
+ end--;
+
if (!mtrr_state_set)
return MTRR_TYPE_INVALID;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index ed7ce5184a77..0b9c7150cb23 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 44404e2307bb..ce5f8e25f70d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -23,6 +23,7 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/overflow.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -565,7 +566,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
struct crash_memmap_data cmd;
struct crash_mem *cmem;
- cmem = vzalloc(sizeof(struct crash_mem));
+ cmem = vzalloc(struct_size(cmem, ranges, 1));
if (!cmem)
return -ENOMEM;
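
struct_size(cmem, ranges, 1) sizes an allocation for a structure with a trailing array and folds the overflow check into the computation, instead of open-coding the sizeof() arithmetic. A generic sketch of the idiom from <linux/overflow.h>; struct sketch_buf and its fields are invented.

#include <linux/overflow.h>
#include <linux/slab.h>

struct sketch_buf {
	unsigned int nr;
	u64 ranges[];			/* flexible array member */
};

static struct sketch_buf *sketch_alloc(unsigned int nr)
{
	/* kzalloc(sizeof(*b) + nr * sizeof(b->ranges[0])), but overflow-safe */
	struct sketch_buf *b = kzalloc(struct_size(b, ranges, nr), GFP_KERNEL);

	if (b)
		b->nr = nr;
	return b;
}
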
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 6abd83572b01..9692ccc583bb 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -249,9 +249,9 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
- char arg[32];
+ char arg[128];
char *argptr = arg;
- int bit;
+ int arglen, res, bit;
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -271,12 +271,26 @@ static void __init fpu__init_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
- sizeof(arg)) &&
- get_option(&argptr, &bit) &&
- bit >= 0 &&
- bit < NCAPINTS * 32)
- setup_clear_cpu_cap(bit);
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen <= 0)
+ return;
+
+ pr_info("Clearing CPUID bits:");
+ do {
+ res = get_option(&argptr, &bit);
+ if (res == 0 || res == 3)
+ break;
+
+ /* If the argument was too long, the last bit may be cut off */
+ if (res == 1 && arglen >= sizeof(arg))
+ break;
+
+ if (bit >= 0 && bit < NCAPINTS * 32) {
+ pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ setup_clear_cpu_cap(bit);
+ }
+ } while (res == 2);
+ pr_cont("\n");
}
/*
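
The loop above relies on get_option()'s return convention: 0 when no number was parsed, 1 when a number was parsed and the string ended, 2 when it was followed by a comma (cursor advanced past it), 3 when it was followed by a '-' range. A standalone sketch of that contract, using an illustrative parse_option() helper rather than the kernel function:

#include <stdio.h>
#include <stdlib.h>

/* Userspace approximation of the return convention described above. */
static int parse_option(char **str, int *val)
{
	char *end;
	long v = strtol(*str, &end, 0);

	if (end == *str)
		return 0;
	*val = (int)v;
	*str = end;
	if (*end == ',') {
		(*str)++;
		return 2;
	}
	if (*end == '-')
		return 3;
	return 1;
}

int main(void)
{
	char arg[] = "1,5,9";	/* arbitrary example bit numbers */
	char *p = arg;
	int bit, res;

	do {
		res = parse_option(&p, &bit);
		if (res == 0 || res == 3)
			break;
		printf("clearing bit %d\n", bit);
	} while (res == 2);
	return 0;
}
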
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index bc02f5144b95..621d249ded0b 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -128,7 +128,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
/*
* A whole standard-format XSAVE buffer is needed:
*/
- if ((pos != 0) || (count < fpu_user_xstate_size))
+ if (pos != 0 || count != fpu_user_xstate_size)
return -EFAULT;
xsave = &fpu->state.xsave;
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index d99a8ee9e185..86a231338bbf 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -272,6 +272,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
int state_size = fpu_kernel_xstate_size;
u64 xfeatures = 0;
int fx_only = 0;
+ int ret = 0;
ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
IS_ENABLED(CONFIG_IA32_EMULATION));
@@ -281,15 +282,21 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
return 0;
}
- if (!access_ok(VERIFY_READ, buf, size))
- return -EACCES;
+ if (!access_ok(VERIFY_READ, buf, size)) {
+ ret = -EACCES;
+ goto out_err;
+ }
fpu__initialize(fpu);
- if (!static_cpu_has(X86_FEATURE_FPU))
- return fpregs_soft_set(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, buf) != 0;
+ if (!static_cpu_has(X86_FEATURE_FPU)) {
+ ret = fpregs_soft_set(current, NULL,
+ 0, sizeof(struct user_i387_ia32_struct),
+ NULL, buf) != 0;
+ if (ret)
+ goto out_err;
+ return 0;
+ }
if (use_xsave()) {
struct _fpx_sw_bytes fx_sw_user;
@@ -349,6 +356,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
fpu__restore(fpu);
local_bh_enable();
+ /* Failure is already handled */
return err;
} else {
/*
@@ -356,13 +364,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
* state to the registers directly (with exceptions handled).
*/
user_fpu_begin();
- if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) {
- fpu__clear(fpu);
- return -1;
- }
+ if (!copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only))
+ return 0;
+ ret = -1;
}
- return 0;
+out_err:
+ fpu__clear(fpu);
+ return ret;
}
static inline int xstate_sigframe_size(void)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 87a57b7642d3..7d372db8bee1 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -405,12 +405,32 @@ static void __init print_xstate_offset_size(void)
}
/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu_buf() individually. This is an explicit
+ * feature list rather than XFEATURE_MASK*SUPPORTED, so that a newly
+ * added supported feature fails the build and makes people actually
+ * look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR)
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
{
static int on_boot_cpu __initdata = 1;
+ BUILD_BUG_ON(XCNTXT_MASK != XFEATURES_INIT_FPSTATE_HANDLED);
+
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
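
The BUILD_BUG_ON is the point of the explicit list above: adding a feature to the supported mask without auditing its init state must break the build. The same pattern can be sketched in plain C with _Static_assert; the FEAT_* names are made up for illustration:

#include <stdio.h>

/* Hypothetical feature bits, stand-ins for the XFEATURE_MASK_* values. */
#define FEAT_A	(1u << 0)
#define FEAT_B	(1u << 1)
#define FEAT_C	(1u << 2)

/* Everything the code claims to support... */
#define SUPPORTED_MASK		(FEAT_A | FEAT_B | FEAT_C)

/* ...and everything whose init state has actually been audited. */
#define INIT_STATE_HANDLED	(FEAT_A | FEAT_B | FEAT_C)

/*
 * Adding a new bit to SUPPORTED_MASK without also adding it above
 * makes this fail at compile time, forcing a human to look at the
 * new feature's init state.
 */
_Static_assert(SUPPORTED_MASK == INIT_STATE_HANDLED,
	       "new feature bit not audited for init state");

int main(void)
{
	printf("supported mask: %#x\n", SUPPORTED_MASK);
	return 0;
}
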
@@ -429,10 +449,22 @@ static void __init setup_init_fpu_buf(void)
copy_kernel_to_xregs_booting(&init_fpstate.xsave);
/*
- * Dump the init state again. This is to identify the init state
- * of any feature which is not represented by all zero's.
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require reshuffling the
+ * data when XSAVES is available because XSAVES uses xstate
+ * compaction. Doing so is a pointless exercise because most
+ * components have an all-zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding a new feature requires ensuring that its
+ * init state is all zeroes, or adding the necessary handling
+ * here if it is not.
*/
- copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+ fxsave(&init_fpstate.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
@@ -907,8 +939,6 @@ const void *get_xsave_field_ptr(int xsave_state)
#ifdef CONFIG_ARCH_HAS_PKEYS
-#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
-#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
/*
* This will go out and modify PKRU register to set the access
* rights for @pkey to @init_val.
@@ -927,6 +957,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return -EINVAL;
+ /*
+ * This code should only be called with valid 'pkey'
+ * values originating from in-kernel users. Complain
+ * if a bad value is observed.
+ */
+ WARN_ON_ONCE(pkey >= arch_max_pkey());
+
/* Set the bits we need in PKRU: */
if (init_val & PKEY_DISABLE_ACCESS)
new_pkru_bits |= PKRU_AD_BIT;
@@ -964,18 +1001,31 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
return true;
}
-/*
- * This is similar to user_regset_copyout(), but will not add offset to
- * the source data pointer or increment pos, count, kbuf, and ubuf.
- */
-static inline void
-__copy_xstate_to_kernel(void *kbuf, const void *data,
- unsigned int offset, unsigned int size, unsigned int size_total)
+static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count)
{
- if (offset < size_total) {
- unsigned int copy = min(size, size_total - offset);
+ if (*pos < to) {
+ unsigned size = to - *pos;
+
+ if (size > *count)
+ size = *count;
+ memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size);
+ *kbuf += size;
+ *pos += size;
+ *count -= size;
+ }
+}
- memcpy(kbuf + offset, data, copy);
+static void copy_part(unsigned offset, unsigned size, void *from,
+ void **kbuf, unsigned *pos, unsigned *count)
+{
+ fill_gap(offset, kbuf, pos, count);
+ if (size > *count)
+ size = *count;
+ if (size) {
+ memcpy(*kbuf, from, size);
+ *kbuf += size;
+ *pos += size;
+ *count -= size;
}
}
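
fill_gap() and copy_part() implement a forward-only copy with a shared cursor: each component is copied at its fixed offset, any hole before it is padded from the init image, and nothing is written past what the caller asked for. A userspace sketch of the same pattern (the buffer names are illustrative):

#include <stdio.h>
#include <string.h>

static unsigned char init_image[64];	/* stand-in for init_fpstate */

/* Pad [*pos, to) from the init image, clipped to what the caller asked for. */
static void fill_gap(unsigned int to, unsigned char **out,
		     unsigned int *pos, unsigned int *count)
{
	if (*pos < to) {
		unsigned int size = to - *pos;

		if (size > *count)
			size = *count;
		memcpy(*out, init_image + *pos, size);
		*out += size;
		*pos += size;
		*count -= size;
	}
}

/* Copy one component at a fixed offset, padding any hole before it. */
static void copy_part(unsigned int offset, unsigned int size, const void *from,
		      unsigned char **out, unsigned int *pos, unsigned int *count)
{
	fill_gap(offset, out, pos, count);
	if (size > *count)
		size = *count;
	if (size) {
		memcpy(*out, from, size);
		*out += size;
		*pos += size;
		*count -= size;
	}
}

int main(void)
{
	unsigned char dst[64] = { 0 }, comp_a[8], comp_b[8];
	unsigned char *out = dst;
	unsigned int pos = 0, count = sizeof(dst);

	memset(init_image, '.', sizeof(init_image));
	memset(comp_a, 'A', sizeof(comp_a));
	memset(comp_b, 'B', sizeof(comp_b));

	copy_part(0, sizeof(comp_a), comp_a, &out, &pos, &count);	/* at offset 0  */
	copy_part(24, sizeof(comp_b), comp_b, &out, &pos, &count);	/* pads 8..24   */
	fill_gap(sizeof(dst), &out, &pos, &count);			/* tail padding */

	fwrite(dst, 1, sizeof(dst), stdout);
	putchar('\n');
	return 0;
}
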
@@ -988,8 +1038,9 @@ __copy_xstate_to_kernel(void *kbuf, const void *data,
*/
int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
{
- unsigned int offset, size;
struct xstate_header header;
+ const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr);
+ unsigned count = size_total;
int i;
/*
@@ -1005,46 +1056,42 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
header.xfeatures = xsave->header.xfeatures;
header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+ if (header.xfeatures & XFEATURE_MASK_FP)
+ copy_part(0, off_mxcsr,
+ &xsave->i387, &kbuf, &offset_start, &count);
+ if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM))
+ copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE,
+ &xsave->i387.mxcsr, &kbuf, &offset_start, &count);
+ if (header.xfeatures & XFEATURE_MASK_FP)
+ copy_part(offsetof(struct fxregs_state, st_space), 128,
+ &xsave->i387.st_space, &kbuf, &offset_start, &count);
+ if (header.xfeatures & XFEATURE_MASK_SSE)
+ copy_part(xstate_offsets[XFEATURE_SSE], 256,
+ &xsave->i387.xmm_space, &kbuf, &offset_start, &count);
+ /*
+ * Fill xsave->i387.sw_reserved value for ptrace frame:
+ */
+ copy_part(offsetof(struct fxregs_state, sw_reserved), 48,
+ xstate_fx_sw_bytes, &kbuf, &offset_start, &count);
/*
* Copy xregs_state->header:
*/
- offset = offsetof(struct xregs_state, header);
- size = sizeof(header);
-
- __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
+ copy_part(offsetof(struct xregs_state, header), sizeof(header),
+ &header, &kbuf, &offset_start, &count);
- for (i = 0; i < XFEATURE_MAX; i++) {
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
/*
* Copy only in-use xstates:
*/
if ((header.xfeatures >> i) & 1) {
void *src = __raw_xsave_addr(xsave, 1 << i);
- offset = xstate_offsets[i];
- size = xstate_sizes[i];
-
- /* The next component has to fit fully into the output buffer: */
- if (offset + size > size_total)
- break;
-
- __copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
+ copy_part(xstate_offsets[i], xstate_sizes[i],
+ src, &kbuf, &offset_start, &count);
}
}
-
- if (xfeatures_mxcsr_quirk(header.xfeatures)) {
- offset = offsetof(struct fxregs_state, mxcsr);
- size = MXCSR_AND_FLAGS_SIZE;
- __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total);
- }
-
- /*
- * Fill xsave->i387.sw_reserved value for ptrace frame:
- */
- offset = offsetof(struct fxregs_state, sw_reserved);
- size = sizeof(xstate_fx_sw_bytes);
-
- __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
+ fill_gap(size_total, &kbuf, &offset_start, &count);
return 0;
}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 02abc134367f..f7833ae4e3f1 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -206,7 +206,7 @@ spurious_8259A_irq:
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG
+ printk_deferred(KERN_DEBUG
"spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3c2326b59820..fbcc303fb1f9 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -274,8 +274,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
{
if (handler)
kvm_posted_intr_wakeup_handler = handler;
- else
+ else {
kvm_posted_intr_wakeup_handler = dummy_handler;
+ synchronize_rcu();
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 4d948d87f01c..a65636d60296 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -211,8 +211,7 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
/* Copying screen_info will do? */
- memcpy(&params->screen_info, &boot_params.screen_info,
- sizeof(struct screen_info));
+ memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info));
/* Fill in memsize later */
params->screen_info.ext_mem_k = 0;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 9d7bb8de2917..cfd8269ab4cd 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -173,6 +173,8 @@ NOKPROBE_SYMBOL(skip_prefixes);
int can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
+ insn_byte_t prefix;
+ int i;
if (search_exception_tables((unsigned long)addr))
return 0; /* Page fault may occur on this address. */
@@ -185,9 +187,14 @@ int can_boost(struct insn *insn, void *addr)
if (insn->opcode.nbytes != 1)
return 0;
- /* Can't boost Address-size override prefix */
- if (unlikely(inat_is_address_size_prefix(insn->attr)))
- return 0;
+ for_each_insn_prefix(insn, i, prefix) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(prefix);
+ /* Can't boost address-size override or CS override prefixes */
+ if (prefix == 0x2e || inat_is_address_size_prefix(attr))
+ return 0;
+ }
opcode = insn->opcode.bytes[0];
@@ -212,8 +219,8 @@ int can_boost(struct insn *insn, void *addr)
/* clear and set flags are boostable */
return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
default:
- /* CS override prefix and call are not boostable */
- return (opcode != 0x2e && opcode != 0x9a);
+ /* call is not boostable */
+ return opcode != 0x9a;
}
}
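
for_each_insn_prefix() walks the legacy prefix bytes recorded by the instruction decoder, so the rejection of CS and address-size overrides now applies wherever the prefix sits instead of only being caught via the opcode-byte switch below. A minimal standalone sketch of that scan, with a toy structure standing in for struct insn:

#include <stdbool.h>
#include <stdio.h>

/* Legacy prefix bytes that the hunk above rejects for boosting. */
#define PFX_CS_OVERRIDE	0x2e
#define PFX_ADDR_SIZE	0x67

/* Minimal stand-in for the decoder state: collected prefixes, 0-terminated. */
struct toy_insn {
	unsigned char prefixes[4];
	unsigned char opcode;
};

static bool can_boost_prefixes(const struct toy_insn *insn)
{
	for (int i = 0; i < 4 && insn->prefixes[i]; i++) {
		unsigned char p = insn->prefixes[i];

		if (p == PFX_CS_OVERRIDE || p == PFX_ADDR_SIZE)
			return false;
	}
	return true;
}

int main(void)
{
	struct toy_insn plain = { .prefixes = { 0 }, .opcode = 0x90 };
	struct toy_insn cs_jmp = { .prefixes = { 0x2e, 0 }, .opcode = 0xeb };

	printf("plain nop: %s\n", can_boost_prefixes(&plain) ? "boostable" : "not boostable");
	printf("cs-prefixed jmp: %s\n", can_boost_prefixes(&cs_jmp) ? "boostable" : "not boostable");
	return 0;
}
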
@@ -744,16 +751,11 @@ asm(
NOKPROBE_SYMBOL(kretprobe_trampoline);
STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
-static struct kprobe kretprobe_kprobe = {
- .addr = (void *)kretprobe_trampoline,
-};
-
/*
* Called from kretprobe_trampoline
*/
__visible __used void *trampoline_handler(struct pt_regs *regs)
{
- struct kprobe_ctlblk *kcb;
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *tmp;
@@ -763,16 +765,12 @@ __visible __used void *trampoline_handler(struct pt_regs *regs)
void *frame_pointer;
bool skipped = false;
- preempt_disable();
-
/*
* Set a dummy kprobe for avoiding kretprobe recursion.
* Since kretprobe never run in kprobe handler, kprobe must not
* be running at this point.
*/
- kcb = get_kprobe_ctlblk();
- __this_cpu_write(current_kprobe, &kretprobe_kprobe);
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ kprobe_busy_begin();
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
@@ -851,7 +849,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs)
__this_cpu_write(current_kprobe, &ri->rp->kp);
ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
- __this_cpu_write(current_kprobe, &kretprobe_kprobe);
+ __this_cpu_write(current_kprobe, &kprobe_busy);
}
recycle_rp_inst(ri, &empty_rp);
@@ -867,8 +865,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs)
kretprobe_hash_unlock(current, &flags);
- __this_cpu_write(current_kprobe, NULL);
- preempt_enable();
+ kprobe_busy_end();
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
@@ -1032,6 +1029,11 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* So clear it by resetting the current kprobe:
*/
regs->flags &= ~X86_EFLAGS_TF;
+ /*
+ * Since the single step (trap) has been cancelled,
+ * we need to restore BTF here.
+ */
+ restore_btf();
/*
* If the TF flag was set before the kprobe hit,
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f58336af095c..1ccfe6bb9122 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -126,6 +126,7 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
+ case R_386_PLT32:
/* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d2ef967bfafb..a07b09f68e7e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -414,28 +414,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
u64 msr = x86_spec_ctrl_base;
bool updmsr = false;
- /*
- * If TIF_SSBD is different, select the proper mitigation
- * method. Note that if SSBD mitigation is disabled or permanentely
- * enabled this branch can't be taken because nothing can set
- * TIF_SSBD.
- */
- if (tif_diff & _TIF_SSBD) {
- if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ /* Handle change of TIF_SSBD depending on the mitigation method. */
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
amd_set_ssb_virt_state(tifn);
- } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
amd_set_core_ssb_state(tifn);
- } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
- static_cpu_has(X86_FEATURE_AMD_SSBD)) {
- msr |= ssbd_tif_to_spec_ctrl(tifn);
- updmsr = true;
- }
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ updmsr |= !!(tif_diff & _TIF_SSBD);
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
}
- /*
- * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
- * otherwise avoid the MSR write.
- */
+ /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
if (IS_ENABLED(CONFIG_SMP) &&
static_branch_unlikely(&switch_to_cond_stibp)) {
updmsr |= !!(tif_diff & _TIF_SPEC_IB);
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
index 898e97cf6629..320ab978fb1f 100644
--- a/arch/x86/kernel/process.h
+++ b/arch/x86/kernel/process.h
@@ -19,7 +19,7 @@ static inline void switch_to_extra(struct task_struct *prev,
if (IS_ENABLED(CONFIG_SMP)) {
/*
* Avoid __switch_to_xtra() invocation when conditional
- * STIPB is disabled and the only different bit is
+ * STIBP is disabled and the only different bit is
* TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
* in the TIF_WORK_CTXSW masks.
*/
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 734549492a18..dc4d27000aa3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -374,7 +374,7 @@ static unsigned long task_seg_base(struct task_struct *task,
*/
mutex_lock(&task->mm->context.lock);
ldt = task->mm->context.ldt;
- if (unlikely(idx >= ldt->nr_entries))
+ if (unlikely(!ldt || idx >= ldt->nr_entries))
base = 0;
else
base = get_desc_base(ldt->entries + idx);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c663d5fcff2e..aa08b7614ab0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -197,6 +197,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
},
},
+ { /* Handle problems with rebooting on Apple MacBook6,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook6,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"),
+ },
+ },
{ /* Handle problems with rebooting on Apple MacBookPro5 */
.callback = set_pci_reboot,
.ident = "Apple MacBookPro5",
@@ -380,10 +388,11 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
},
{ /* Handle problems with rebooting on the OptiPlex 990. */
.callback = set_pci_reboot,
- .ident = "Dell OptiPlex 990",
+ .ident = "Dell OptiPlex 990 BIOS A0x",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A0"),
},
},
{ /* Handle problems with rebooting on Dell 300's */
@@ -469,6 +478,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
},
},
+ { /* PCIe Wifi card isn't detected after reboot otherwise */
+ .callback = set_pci_reboot,
+ .ident = "Zotac ZBOX CI327 nano",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NA"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
+ },
+ },
+
/* Sony */
{ /* Handle problems with rebooting on Sony VGN-Z540N */
.callback = set_bios_reboot,
@@ -530,29 +548,20 @@ static void emergency_vmx_disable_all(void)
local_irq_disable();
/*
- * We need to disable VMX on all CPUs before rebooting, otherwise
- * we risk hanging up the machine, because the CPU ignore INIT
- * signals when VMX is enabled.
- *
- * We can't take any locks and we may be on an inconsistent
- * state, so we use NMIs as IPIs to tell the other CPUs to disable
- * VMX and halt.
+ * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
+ * the machine, because the CPU blocks INIT when it's in VMX root.
*
- * For safety, we will avoid running the nmi_shootdown_cpus()
- * stuff unnecessarily, but we don't have a way to check
- * if other CPUs have VMX enabled. So we will call it only if the
- * CPU we are running on has VMX enabled.
+ * We can't take any locks and we may be on an inconsistent state, so
+ * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
*
- * We will miss cases where VMX is not enabled on all CPUs. This
- * shouldn't do much harm because KVM always enable VMX on all
- * CPUs anyway. But we can miss it on the small window where KVM
- * is still enabling VMX.
+ * Do the NMI shootdown even if VMX is off on _this_ CPU, as that
+ * doesn't prevent a different CPU from being in VMX root operation.
*/
- if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU. */
- cpu_vmxoff();
+ if (cpu_has_vmx()) {
+ /* Safely force _this_ CPU out of VMX root operation. */
+ __cpu_emergency_vmxoff();
- /* Halt and disable VMX on the other CPUs */
+ /* Halt and exit VMX root operation on the other CPUs. */
nmi_shootdown_cpus(vmxoff_nmi);
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4bc12447a50f..c8dd6553634c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1211,6 +1211,8 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
acpi_table_upgrade();
+ /* Look for ACPI tables and reserve memory occupied by them. */
+ acpi_boot_table_init();
vsmp_init();
@@ -1218,11 +1220,6 @@ void __init setup_arch(char **cmdline_p)
early_platform_quirks();
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
-
early_acpi_boot_init();
initmem_init();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 01741834fd6a..a18632b4da83 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -769,30 +769,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
{
- /*
- * This function is fundamentally broken as currently
- * implemented.
- *
- * The idea is that we want to trigger a call to the
- * restart_block() syscall and that we want in_ia32_syscall(),
- * in_x32_syscall(), etc. to match whatever they were in the
- * syscall being restarted. We assume that the syscall
- * instruction at (regs->ip - 2) matches whatever syscall
- * instruction we used to enter in the first place.
- *
- * The problem is that we can get here when ptrace pokes
- * syscall-like values into regs even if we're not in a syscall
- * at all.
- *
- * For now, we maintain historical behavior and guess based on
- * stored state. We could do better by saving the actual
- * syscall arch in restart_block or (with caveats on x32) by
- * checking if regs->ip points to 'int $0x80'. The current
- * behavior is incorrect if a tracer has a different bitness
- * than the tracee.
- */
#ifdef CONFIG_IA32_EMULATION
- if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
+ if (current_thread_info()->status & TS_COMPAT_RESTART)
return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 30447d210f37..66f2a950935a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -270,6 +270,14 @@ static void notrace start_secondary(void *unused)
wmb();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+
+ /*
+ * Prevent tail call to cpu_startup_entry() because the stack protector
+ * guard has been changed a couple of function calls up, in
+ * boot_init_stack_canary() and must not be checked before tail calling
+ * another function.
+ */
+ prevent_tail_call_optimization();
}
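
The added call only exists to keep the compiler from turning the preceding call into a tail call: with a tail jump, the function epilogue runs first, and with stack protector enabled that epilogue compares the canary saved at function entry against the guard value that was just changed further up. One common way to achieve the same effect in plain C is a trailing compiler barrier, sketched below (the canary handling itself is only hinted at in comments):

#include <stdio.h>

/* An empty asm with a memory clobber; the statement after the call keeps it out of tail position. */
#define compiler_barrier()	__asm__ __volatile__("" ::: "memory")

static void next_stage(void)
{
	puts("running next stage");
}

static void start_cpu(void)
{
	/*
	 * Imagine the stack-protector guard being changed here, as
	 * boot_init_stack_canary() does a few calls up in start_secondary().
	 */
	next_stage();
	/*
	 * Because a statement follows the call, the compiler must emit
	 * "call next_stage" rather than "jmp next_stage", so no epilogue
	 * (and no canary check) runs before entering next_stage().
	 */
	compiler_barrier();
}

int main(void)
{
	start_cpu();
	return 0;
}
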
/**
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index ab0176ae985b..12f90f17f4f6 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -24,10 +24,6 @@
#include <asm/hpet.h>
#include <asm/time.h>
-#ifdef CONFIG_X86_64
-__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES;
-#endif
-
unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 3bbb399f7ead..e64c5b78fbfd 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -90,9 +90,6 @@ static struct orc_entry null_orc_entry = {
static struct orc_entry *orc_find(unsigned long ip)
{
- if (!orc_init)
- return NULL;
-
if (ip == 0)
return &null_orc_entry;
@@ -342,8 +339,11 @@ bool unwind_next_frame(struct unwind_state *state)
/*
* Find the orc_entry associated with the text address.
*
- * Decrement call return addresses by one so they work for sibling
- * calls and calls to noreturn functions.
+ * For a call frame (as opposed to a signal frame), state->ip points to
+ * the instruction after the call. That instruction's stack layout
+ * could be different from the call instruction's layout, for example
+ * if the call was to a noreturn function. So get the ORC data for the
+ * call instruction itself.
*/
orc = orc_find(state->signal ? state->ip : state->ip - 1);
if (!orc || orc->sp_reg == ORC_REG_UNDEFINED)
@@ -460,7 +460,7 @@ bool unwind_next_frame(struct unwind_state *state)
default:
orc_warn("unknown .orc_unwind entry type %d for ip %pB\n",
orc->type, (void *)orig_ip);
- break;
+ goto done;
}
/* Find BP: */
@@ -511,17 +511,20 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
memset(state, 0, sizeof(*state));
state->task = task;
+ if (!orc_init)
+ goto err;
+
/*
* Refuse to unwind the stack of a task while it's executing on another
* CPU. This check is racy, but that's ok: the unwinder has other
* checks to prevent it from going off the rails.
*/
if (task_on_another_cpu(task))
- goto done;
+ goto err;
if (regs) {
if (user_mode(regs))
- goto done;
+ goto the_end;
state->ip = regs->ip;
state->sp = kernel_stack_pointer(regs);
@@ -540,9 +543,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
} else {
struct inactive_task_frame *frame = (void *)task->thread.sp;
- state->sp = task->thread.sp;
+ state->sp = task->thread.sp + sizeof(*frame);
state->bp = READ_ONCE_NOCHECK(frame->bp);
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ state->signal = (void *)state->ip == ret_from_fork;
}
if (get_stack_info((unsigned long *)state->sp, state->task,
@@ -554,6 +558,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
* generate some kind of backtrace if this happens.
*/
void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ state->error = true;
if (get_stack_info(next_page, state->task, &state->stack_info,
&state->stack_mask))
return;
@@ -574,13 +579,14 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
/* Otherwise, skip ahead to the user-specified starting frame: */
while (!unwind_done(state) &&
(!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
- state->sp <= (unsigned long)first_frame))
+ state->sp < (unsigned long)first_frame))
unwind_next_frame(state);
return;
-done:
+err:
+ state->error = true;
+the_end:
state->stack_info.type = STACK_TYPE_UNKNOWN;
- return;
}
EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 73391c1bd2a9..52bb7413f352 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -268,10 +268,11 @@ static volatile u32 good_2byte_insns[256 / 32] = {
static bool is_prefix_bad(struct insn *insn)
{
+ insn_byte_t p;
int i;
- for (i = 0; i < insn->prefixes.nbytes; i++) {
- switch (insn->prefixes.bytes[i]) {
+ for_each_insn_prefix(insn, i, p) {
+ switch (p) {
case 0x26: /* INAT_PFX_ES */
case 0x2E: /* INAT_PFX_CS */
case 0x36: /* INAT_PFX_DS */
@@ -711,6 +712,7 @@ static const struct uprobe_xol_ops branch_xol_ops = {
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 opc1 = OPCODE1(insn);
+ insn_byte_t p;
int i;
switch (opc1) {
@@ -741,8 +743,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
* Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
* No one uses these insns, reject any branch insns with such prefix.
*/
- for (i = 0; i < insn->prefixes.nbytes; i++) {
- if (insn->prefixes.bytes[i] == 0x66)
+ for_each_insn_prefix(insn, i, p) {
+ if (p == 0x66)
return -ENOTSUPP;
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 2384a2ae5ec3..d3dc8bc6b3ad 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -36,13 +36,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
#ifdef CONFIG_X86_32
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
-jiffies = jiffies_64;
#else
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
#endif
+jiffies = jiffies_64;
+
#if defined(CONFIG_X86_64)
/*
* On 64-bit, align RODATA to 2MB so we retain large page mappings for
@@ -352,7 +352,8 @@ SECTIONS
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
__bss_start = .;
*(.bss..page_aligned)
- *(.bss)
+ . = ALIGN(PAGE_SIZE);
+ *(BSS_MAIN)
. = ALIGN(PAGE_SIZE);
__bss_stop = .;
}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 1152afad524f..7e1ab0e0f3f2 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -494,7 +494,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->edx |= F(SPEC_CTRL);
if (boot_cpu_has(X86_FEATURE_STIBP))
entry->edx |= F(INTEL_STIBP);
- if (boot_cpu_has(X86_FEATURE_SSBD))
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
entry->edx |= F(SPEC_CTRL_SSBD);
/*
* We emulate ARCH_CAPABILITIES in software even
@@ -648,8 +649,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
unsigned phys_as = entry->eax & 0xff;
- if (!g_phys_as)
+ /*
+ * Use bare metal's MAXPHYADDR if the CPU doesn't report guest
+ * MAXPHYADDR separately, or if TDP (NPT) is disabled, as the
+ * guest version "applies only to guests using nested paging".
+ */
+ if (!g_phys_as || !tdp_enabled)
g_phys_as = phys_as;
+
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
/*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4cc8a4a6f1d0..23d3329e1c73 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3544,7 +3544,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
u64 tsc_aux = 0;
if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
- return emulate_gp(ctxt, 0);
+ return emulate_ud(ctxt);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}
@@ -3949,6 +3949,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
static int em_movsxd(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.val = (s32) ctxt->src.val;
@@ -4463,7 +4469,7 @@ static const struct opcode group11[] = {
};
static const struct gprefix pfx_0f_ae_7 = {
- I(SrcMem | ByteOp, em_clflush), N, N, N,
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
};
static const struct group_dual group15 = { {
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index f500293dad8d..ce4a9f1f845e 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -5,7 +5,7 @@
#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD)
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 537c36b55b5d..d4fdf0e52144 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1918,7 +1918,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
+ if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
apic_lvtt_period(apic))
return;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e5af08b58132..68ed9c01ca3c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -263,6 +263,11 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
*/
static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+static u8 __read_mostly shadow_phys_bits;
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -275,11 +280,18 @@ static bool is_executable_pte(u64 spte);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
BUG_ON((mmio_mask & mmio_value) != mmio_value);
+ WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+ WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_value;
+}
+
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
return sp->role.ad_disabled;
@@ -287,7 +299,7 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
- MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ MMU_WARN_ON(is_mmio_spte(spte));
return !(spte & shadow_acc_track_value);
}
@@ -298,13 +310,13 @@ static bool is_nx_huge_page_enabled(void)
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
- MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ MMU_WARN_ON(is_mmio_spte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
- MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ MMU_WARN_ON(is_mmio_spte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
@@ -374,11 +386,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
mmu_spte_set(sptep, mask);
}
-static bool is_mmio_spte(u64 spte)
-{
- return (spte & shadow_mmio_mask) == shadow_mmio_value;
-}
-
static gfn_t get_mmio_spte_gfn(u64 spte)
{
u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
@@ -443,6 +450,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+static u8 kvm_get_shadow_phys_bits(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
+ * in CPU detection code, but MKTME treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore for MKTME
+ * we should still return physical address bits reported by CPUID.
+ */
+ if (!boot_cpu_has(X86_FEATURE_TME) ||
+ WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
+ return boot_cpu_data.x86_phys_bits;
+
+ return cpuid_eax(0x80000008) & 0xff;
+}
+
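
kvm_get_shadow_phys_bits() goes back to CPUID leaf 0x80000008, whose EAX[7:0] reports the physical address width, because boot_cpu_data.x86_phys_bits may already have been reduced to carve out MKTME keyID bits. The same leaf can be queried from userspace with the compiler's <cpuid.h> helpers:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.80000008H:EAX[7:0] is the physical address width, [15:8] the linear one. */
	if (__get_cpuid_max(0x80000000, NULL) < 0x80000008 ||
	    !__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "leaf 0x80000008 not available\n");
		return 1;
	}

	printf("physical address bits: %u\n", eax & 0xff);
	printf("linear   address bits: %u\n", (eax >> 8) & 0xff);
	return 0;
}
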
static void kvm_mmu_reset_all_pte_masks(void)
{
u8 low_phys_bits;
@@ -456,20 +478,29 @@ static void kvm_mmu_reset_all_pte_masks(void)
shadow_present_mask = 0;
shadow_acc_track_mask = 0;
+ shadow_phys_bits = kvm_get_shadow_phys_bits();
+
/*
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is
* assumed that the CPU is not vulnerable to L1TF.
+ *
+ * Some Intel CPUs address the L1 cache using more PA bits than are
+ * reported by CPUID. Use the PA width of the L1 cache when possible
+ * to achieve more effective mitigation, e.g. if system RAM overlaps
+ * the most significant bits of legal physical address space.
*/
+ shadow_nonpresent_or_rsvd_mask = 0;
low_phys_bits = boot_cpu_data.x86_phys_bits;
- if (boot_cpu_data.x86_phys_bits <
- 52 - shadow_nonpresent_or_rsvd_mask_len) {
+ if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+ !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+ 52 - shadow_nonpresent_or_rsvd_mask_len)) {
+ low_phys_bits = boot_cpu_data.x86_cache_bits
+ - shadow_nonpresent_or_rsvd_mask_len;
shadow_nonpresent_or_rsvd_mask =
- rsvd_bits(boot_cpu_data.x86_phys_bits -
- shadow_nonpresent_or_rsvd_mask_len,
- boot_cpu_data.x86_phys_bits - 1);
- low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
+ rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
}
+
shadow_nonpresent_or_rsvd_lower_gfn_mask =
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}
@@ -1682,10 +1713,10 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
* Emulate arch specific page modification logging for the
* nested hypervisor
*/
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
{
if (kvm_x86_ops->write_log_dirty)
- return kvm_x86_ops->write_log_dirty(vcpu);
+ return kvm_x86_ops->write_log_dirty(vcpu, l2_gpa);
return 0;
}
@@ -4213,7 +4244,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | gbpages_bit_rsvd |
+ gbpages_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
@@ -4295,7 +4326,16 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
- bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+ /*
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
+ */
+ bool uses_nx = context->nx || !tdp_enabled ||
+ context->base_role.smep_andnot_wp;
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -4305,7 +4345,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
*/
shadow_zero_check = &context->shadow_zero_check;
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- boot_cpu_data.x86_phys_bits,
+ shadow_phys_bits,
context->shadow_root_level, uses_nx,
guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu), true);
@@ -4342,13 +4382,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
if (boot_cpu_is_amd())
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- boot_cpu_data.x86_phys_bits,
+ shadow_phys_bits,
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
true, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
- boot_cpu_data.x86_phys_bits,
+ shadow_phys_bits,
false);
if (!shadow_me_mask)
@@ -4369,7 +4409,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- boot_cpu_data.x86_phys_bits, execonly);
+ shadow_phys_bits, execonly);
}
#define BYTE_MASK(access) \
@@ -5666,6 +5706,25 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
return 0;
}
+static void kvm_set_mmio_spte_mask(void)
+{
+ u64 mask;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
+}
+
int kvm_mmu_module_init(void)
{
if (nx_huge_pages == -1)
@@ -5673,6 +5732,8 @@ int kvm_mmu_module_init(void)
kvm_mmu_reset_all_pte_masks();
+ kvm_set_mmio_spte_mask();
+
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
@@ -5794,6 +5855,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
cond_resched_lock(&kvm->mmu_lock);
}
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 068feab64acf..e76f5963aa7d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -53,7 +53,7 @@ static inline u64 rsvd_bits(int s, int e)
if (e < s)
return 0;
- return ((1ULL << (e - s + 1)) - 1) << s;
+ return ((2ULL << (e - s)) - 1) << s;
}
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
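
The rewritten rsvd_bits() produces the same mask but never shifts by 64: for e - s up to 63, 2ULL << (e - s) is a defined shift, whereas the old 1ULL << (e - s + 1) is undefined behaviour when the range spans all 64 bits. A standalone check of the fixed formula:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Bits s..e inclusive, written so the shift count never reaches 64. */
static uint64_t rsvd_bits(int s, int e)
{
	if (e < s)
		return 0;
	return ((2ULL << (e - s)) - 1) << s;
}

int main(void)
{
	/* Empty range, a typical PA mask, and the full 64-bit range. */
	printf("rsvd_bits(52, 51) = %#018" PRIx64 "\n", rsvd_bits(52, 51));
	printf("rsvd_bits(12, 51) = %#018" PRIx64 "\n", rsvd_bits(12, 51));
	printf("rsvd_bits(0,  63) = %#018" PRIx64 "\n", rsvd_bits(0, 63));
	return 0;
}
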
@@ -194,7 +194,7 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 918b0d5bf272..1c1c2649829b 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -339,7 +339,7 @@ TRACE_EVENT(
/* These depend on page entry type, so compute them now. */
__field(bool, r)
__field(bool, x)
- __field(u8, u)
+ __field(signed char, u)
),
TP_fast_assign(
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 8cf7a09bdd73..4b921f1a165d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -93,8 +93,8 @@ struct guest_walker {
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
bool pte_writable[PT_MAX_FULL_LEVELS];
- unsigned pt_access;
- unsigned pte_access;
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
gfn_t gfn;
struct x86_exception fault;
};
@@ -202,7 +202,7 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
struct guest_walker *walker,
- int write_fault)
+ gpa_t addr, int write_fault)
{
unsigned level, index;
pt_element_t pte, orig_pte;
@@ -227,7 +227,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
!(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
- if (kvm_arch_write_log_dirty(vcpu))
+ if (kvm_arch_write_log_dirty(vcpu, addr))
return -EINVAL;
#endif
pte |= PT_GUEST_DIRTY_MASK;
@@ -388,13 +388,15 @@ retry_walk:
}
walker->ptes[walker->level - 1] = pte;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
} while (!is_last_gpte(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
- walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask);
errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
@@ -424,7 +426,8 @@ retry_walk:
(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
if (unlikely(!accessed_dirty)) {
- ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ addr, write_fault);
if (unlikely(ret < 0))
goto error;
else if (ret)
@@ -432,7 +435,8 @@ retry_walk:
}
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, walker->pte_access, walker->pt_access);
+ __func__, (u64)pte, walker->pte_access,
+ walker->pt_access[walker->level - 1]);
return 1;
error:
@@ -601,7 +605,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
- unsigned direct_access, access = gw->pt_access;
+ unsigned int direct_access, access;
int top_level, ret;
gfn_t gfn, base_gfn;
@@ -633,6 +637,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = NULL;
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
false, access);
}
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index 84ae4dd261ca..cafdaabf062f 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -29,7 +29,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
+ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
};
/* mapping between fixed pmc index and intel_arch_events array */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d63621386418..5ff6c145fdbb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -389,6 +389,9 @@ static void recalc_intercepts(struct vcpu_svm *svm)
c->intercept_dr = h->intercept_dr | g->intercept_dr;
c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
c->intercept = h->intercept | g->intercept;
+
+ c->intercept |= (1ULL << INTERCEPT_VMLOAD);
+ c->intercept |= (1ULL << INTERCEPT_VMSAVE);
}
static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
@@ -1208,12 +1211,7 @@ static __init int svm_hardware_setup(void)
}
}
- if (vgif) {
- if (!boot_cpu_has(X86_FEATURE_VGIF))
- vgif = false;
- else
- pr_info("Virtual GIF supported\n");
- }
+ vgif = false; /* Disabled for CVE-2021-3653 */
return 0;
@@ -2757,8 +2755,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs, but not async PF */
- if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
+ /* Trap async PF even if not shadowing */
+ if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason)
return NESTED_EXIT_HOST;
break;
default:
@@ -2847,7 +2845,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
dst->iopm_base_pa = from->iopm_base_pa;
dst->msrpm_base_pa = from->msrpm_base_pa;
dst->tsc_offset = from->tsc_offset;
- dst->asid = from->asid;
+ /* asid not copied, it is handled manually for svm->vmcb. */
dst->tlb_ctl = from->tlb_ctl;
dst->int_ctl = from->int_ctl;
dst->int_vector = from->int_vector;
@@ -3161,7 +3159,13 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->nested.intercept = nested_vmcb->control.intercept;
svm_flush_tlb(&svm->vcpu, true);
- svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
+
+ svm->vmcb->control.int_ctl &=
+ V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+
+ svm->vmcb->control.int_ctl |= nested_vmcb->control.int_ctl &
+ (V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK);
+
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
else
@@ -3532,7 +3536,7 @@ static int cr_interception(struct vcpu_svm *svm)
err = 0;
if (cr >= 16) { /* mov to cr */
cr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
+ val = kvm_register_readl(&svm->vcpu, reg);
switch (cr) {
case 0:
if (!check_selective_cr0_intercepted(svm, val))
@@ -3577,7 +3581,7 @@ static int cr_interception(struct vcpu_svm *svm)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_register_writel(&svm->vcpu, reg, val);
}
return kvm_complete_insn_gp(&svm->vcpu, err);
}
@@ -3607,13 +3611,13 @@ static int dr_interception(struct vcpu_svm *svm)
if (dr >= 16) { /* mov to DRn */
if (!kvm_require_dr(&svm->vcpu, dr - 16))
return 1;
- val = kvm_register_read(&svm->vcpu, reg);
+ val = kvm_register_readl(&svm->vcpu, reg);
kvm_set_dr(&svm->vcpu, dr - 16, val);
} else {
if (!kvm_require_dr(&svm->vcpu, dr))
return 1;
kvm_get_dr(&svm->vcpu, dr, &val);
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_register_writel(&svm->vcpu, reg, val);
}
return kvm_skip_emulated_instruction(&svm->vcpu);
@@ -4862,6 +4866,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* - Tell IOMMU to use legacy mode for this interrupt.
* - Retrieve ga_tag of prior interrupt remapping data.
*/
+ pi.prev_ga_tag = 0;
pi.is_guest_mode = false;
ret = irq_set_vcpu_affinity(host_irq, &pi);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85680b86524..396d41f192ca 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1674,43 +1674,15 @@ static void vmcs_load(struct vmcs *vmcs)
}
#ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
static void crash_vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
- if (!crash_local_vmclear_enabled(cpu))
- return;
-
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
}
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
@@ -1722,19 +1694,24 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- crash_disable_local_vmclear(cpu);
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/*
- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
- * is before setting loaded_vmcs->vcpu to -1 which is done in
- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
- * then adds the vmcs into percpu list before it is deleted.
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->cpu to
+ * -1, otherwise a different cpu can see cpu == -1 first and add
+ * loaded_vmcs to its percpu list before it's deleted from this cpu's
+ * list. Pairs with the smp_rmb() in vmx_vcpu_load().
*/
smp_wmb();
- loaded_vmcs_init(loaded_vmcs);
- crash_enable_local_vmclear(cpu);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
}
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -2497,18 +2474,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
- crash_disable_local_vmclear(cpu);
/*
- * Read loaded_vmcs->cpu should be before fetching
- * loaded_vmcs->loaded_vmcss_on_cpu_link.
- * See the comments in __loaded_vmcs_clear().
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smp_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
- crash_enable_local_vmclear(cpu);
local_irq_enable();
}
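
The reworded comments in the two hunks above describe a classic publish/consume ordering: the clearing side finishes deleting the VMCS from the old percpu list and only then publishes cpu == -1, while the loading side observes cpu before touching its own list. The same message-passing pattern, sketched with C11 fences (payload/published are illustrative stand-ins for the list update and the cpu field):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;		/* stands in for "deleted from the old list" */
static atomic_int published;	/* stands in for "cpu set to -1"             */

static void *writer(void *arg)
{
	(void)arg;
	payload = 42;					/* all prior writes...   */
	atomic_thread_fence(memory_order_release);	/* ...ordered before...  */
	atomic_store_explicit(&published, 1, memory_order_relaxed); /* ...the flag */
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	while (!atomic_load_explicit(&published, memory_order_relaxed))
		;					/* spin until the flag is seen  */
	atomic_thread_fence(memory_order_acquire);	/* pairs with the release fence */
	printf("payload = %d\n", payload);		/* guaranteed to print 42       */
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}

Built with -pthread, the reader always prints 42 because the acquire fence after reading the flag synchronizes with the writer's release fence, just as the smp_rmb() in vmx_vcpu_load() pairs with the smp_wmb() in __loaded_vmcs_clear().
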
@@ -3800,21 +3776,6 @@ static int hardware_enable(void)
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
- /*
- * Now we can enable the vmclear operation in kdump
- * since the loaded_vmcss_on_cpu list on this cpu
- * has been initialized.
- *
- * Though the cpu is not in VMX operation now, there
- * is no problem to enable the vmclear operation
- * for the loaded_vmcss_on_cpu list is empty!
- */
- crash_enable_local_vmclear(cpu);
-
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
@@ -5631,6 +5592,8 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
+ BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS);
+
vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
if (enable_ept)
vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
@@ -6198,8 +6161,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return (!to_vmx(vcpu)->nested.nested_run_pending &&
- vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return false;
+
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return true;
+
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
@@ -6284,7 +6252,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
*/
static void kvm_machine_check(void)
{
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_MCE)
struct pt_regs regs = {
.cs = 3, /* Fake ring 3 no matter what the guest ran on */
.flags = X86_EFLAGS_IF,
@@ -8745,7 +8713,7 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
- switch (exit_reason) {
+ switch ((u16)exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
if (is_nmi(intr_info))
return false;
@@ -9176,6 +9144,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
exit_reason != EXIT_REASON_EPT_VIOLATION &&
exit_reason != EXIT_REASON_PML_FULL &&
+ exit_reason != EXIT_REASON_APIC_ACCESS &&
exit_reason != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@ -9314,15 +9283,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
+ if (!flexpriority_enabled &&
+ !cpu_has_vmx_virtualize_x2apic_mode())
+ return;
+
/* Postpone execution until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
return;
}
- if (!cpu_need_tpr_shadow(vcpu))
- return;
-
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -11659,7 +11629,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
}
}
-static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qual;
@@ -11697,8 +11667,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
return 0;
}
- if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
- nested_exit_on_intr(vcpu)) {
+ if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
if (block_nested_events)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
@@ -12254,17 +12223,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- /*
- * TODO: SDM says that with acknowledge interrupt on
- * exit, bit 31 of the VM-exit interrupt information
- * (valid interrupt) is always set to 1 on
- * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
- * need kvm_cpu_has_interrupt(). See the commit
- * message for details.
- */
- if (nested_exit_intr_ack_set(vcpu) &&
- exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- kvm_cpu_has_interrupt(vcpu)) {
+ if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ nested_exit_intr_ack_set(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
@@ -12505,11 +12465,10 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
kvm_flush_pml_buffers(kvm);
}
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
{
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t gpa;
struct page *page = NULL;
u64 *pml_address;
@@ -12530,7 +12489,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
return 1;
}
- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+ gpa &= ~0xFFFull;
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
if (is_error_page(page))
@@ -12940,7 +12899,7 @@ module_exit(vmx_exit)
static int __init vmx_init(void)
{
- int r;
+ int r, cpu;
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
@@ -12962,6 +12921,12 @@ static int __init vmx_init(void)
}
}
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d6851636edab..0690155f42b2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -98,6 +98,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void process_smi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -399,8 +400,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
- if (has_error && !is_protmode(vcpu))
- has_error = false;
if (reinject) {
/*
* On vmentry, vcpu->arch.exception.pending is only
@@ -806,6 +805,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
@@ -2344,7 +2345,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_write(vcpu, msr, data);
case MSR_IA32_TSCDEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
@@ -2354,6 +2355,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
+ /* Before returning to the guest, tsc_timestamp must be adjusted
+ * as well; otherwise the guest's per-CPU pvclock time could jump.
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
}
@@ -2629,7 +2634,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_APICBASE:
msr_info->data = kvm_get_apic_base(vcpu);
break;
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_TSCDEADLINE:
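Aside (not part of the patch): both range cases above are narrowed because the architectural x2APIC register space spans only MSRs 0x800-0x8FF; the old APIC_BASE_MSR + 0x3ff bound wrongly forwarded accesses up to 0xBFF to the virtual APIC. A minimal sketch of the intended bounds check, assuming APIC_BASE_MSR is 0x800 as in apicdef.h:

#include <linux/types.h>

#define APIC_BASE_MSR	0x800

/* Illustrative only; mirrors the narrowed "case" range above. */
static bool is_x2apic_msr(u32 msr)
{
	/* 256 registers: 0x800 .. 0x8ff */
	return msr >= APIC_BASE_MSR && msr <= APIC_BASE_MSR + 0xff;
}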
@@ -3073,6 +3078,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
+ if (vcpu->arch.st.steal.preempted)
+ return;
+
vcpu->arch.st.steal.preempted = 1;
kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
@@ -3214,7 +3222,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
unsigned bank_num = mcg_cap & 0xff, bank;
r = -EINVAL;
- if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
+ if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
goto out;
if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
@@ -3285,6 +3293,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
process_nmi(vcpu);
+
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+
/*
* FIXME: pass injected and pending separately. This is only
* needed for nested virtualization, whose state cannot be
@@ -4365,10 +4377,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&u.ps, argp, sizeof u.ps))
goto out;
+ mutex_lock(&kvm->lock);
r = -ENXIO;
if (!kvm->arch.vpit)
- goto out;
+ goto set_pit_out;
r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
+set_pit_out:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_GET_PIT2: {
@@ -4388,10 +4403,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
goto out;
+ mutex_lock(&kvm->lock);
r = -ENXIO;
if (!kvm->arch.vpit)
- goto out;
+ goto set_pit2_out;
r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+set_pit2_out:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_REINJECT_CONTROL: {
@@ -6291,35 +6309,6 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
-static void kvm_set_mmio_spte_mask(void)
-{
- u64 mask;
- int maxphyaddr = boot_cpu_data.x86_phys_bits;
-
- /*
- * Set the reserved bits and the present bit of an paging-structure
- * entry to generate page fault with PFER.RSV = 1.
- */
-
- /*
- * Mask the uppermost physical address bit, which would be reserved as
- * long as the supported physical address width is less than 52.
- */
- mask = 1ull << 51;
-
- /* Set the present bit. */
- mask |= 1ull;
-
- /*
- * If reserved bit is not supported, clear the present bit to disable
- * mmio page fault.
- */
- if (maxphyaddr == 52)
- mask &= ~1ull;
-
- kvm_mmu_set_mmio_spte_mask(mask, mask);
-}
-
#ifdef CONFIG_X86_64
static void pvclock_gtod_update_fn(struct work_struct *work)
{
@@ -6397,8 +6386,6 @@ int kvm_arch_init(void *opaque)
if (r)
goto out_free_percpu;
- kvm_set_mmio_spte_mask();
-
kvm_x86_ops = ops;
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
@@ -6435,6 +6422,7 @@ void kvm_arch_exit(void)
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
@@ -6638,13 +6626,20 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ kvm_x86_ops->queue_exception(vcpu);
+}
+
+static int inject_pending_event(struct kvm_vcpu *vcpu)
{
int r;
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected) {
- kvm_x86_ops->queue_exception(vcpu);
+ kvm_inject_exception(vcpu);
return 0;
}
@@ -6665,7 +6660,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ r = kvm_x86_ops->check_nested_events(vcpu);
if (r != 0)
return r;
}
@@ -6689,7 +6684,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_update_dr7(vcpu);
}
- kvm_x86_ops->queue_exception(vcpu);
+ kvm_inject_exception(vcpu);
} else if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
vcpu->arch.smi_pending = false;
enter_smm(vcpu);
@@ -6706,7 +6701,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
* KVM_REQ_EVENT only on certain events and not unconditionally?
*/
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ r = kvm_x86_ops->check_nested_events(vcpu);
if (r != 0)
return r;
}
@@ -7152,7 +7147,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
- if (inject_pending_event(vcpu, req_int_win) != 0)
+ if (inject_pending_event(vcpu) != 0)
req_immediate_exit = true;
else {
/* Enable NMI/IRQ window open exits if needed.
@@ -7251,6 +7246,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[3], 3);
set_debugreg(vcpu->arch.dr6, 6);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(0, 7);
}
kvm_x86_ops->run(vcpu);
@@ -7360,7 +7357,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
- kvm_x86_ops->check_nested_events(vcpu, false);
+ kvm_x86_ops->check_nested_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
@@ -8584,6 +8581,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
{
int i;
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by __kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
+
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
@@ -8657,6 +8661,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
+ if (change == KVM_MR_MOVE)
+ return kvm_arch_create_memslot(kvm, memslot,
+ mem->memory_size >> PAGE_SHIFT);
+
return 0;
}
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index 693cce0be82d..47ea8d20af47 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -240,7 +240,7 @@ static void __wrmsr_safe_regs_on_cpu(void *info)
rv->err = wrmsr_safe_regs(rv->regs);
}
-int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
int err;
struct msr_regs_info rv;
@@ -253,7 +253,7 @@ int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
}
EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
-int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
int err;
struct msr_regs_info rv;
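Usage note (not from the patch): the prototype change from u32 *regs to u32 regs[8] documents that callers must pass the full eight-entry GPR block consumed by rdmsr_safe_regs()/wrmsr_safe_regs(). A minimal sketch, assuming the usual eax, ecx, edx, ebx, <spare>, ebp, esi, edi index order (treat that layout as an assumption here), with the MSR index in regs[1] and the result returned in regs[0]/regs[2]:

#include <linux/types.h>
#include <asm/msr.h>

/* Hypothetical helper: read IA32_TSC (MSR 0x10) on CPU 0 via the regs[8] API. */
static int read_tsc_on_cpu0(u64 *val)
{
	u32 regs[8] = { 0 };
	int err;

	regs[1] = 0x10;					/* MSR index goes in ecx */
	err = rdmsr_safe_regs_on_cpu(0, regs);
	if (!err)
		*val = ((u64)regs[2] << 32) | regs[0];	/* edx:eax */
	return err;
}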
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 75d3776123cc..2c3b4bcbe8f2 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -118,7 +118,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
*/
if (size < 8) {
if (!IS_ALIGNED(dest, 4) || size != 4)
- clean_cache_range(dst, 1);
+ clean_cache_range(dst, size);
} else {
if (!IS_ALIGNED(dest, 8)) {
dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S
index f031c0e19356..515cdee90df7 100644
--- a/arch/x86/math-emu/wm_sqrt.S
+++ b/arch/x86/math-emu/wm_sqrt.S
@@ -209,7 +209,7 @@ sqrt_stage_2_finish:
#ifdef PARANOID
/* It should be possible to get here only if the arg is ffff....ffff */
- cmp $0xffffffff,FPU_fsqrt_arg_1
+ cmpl $0xffffffff,FPU_fsqrt_arg_1
jnz sqrt_stage_2_error
#endif /* PARANOID */
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index ab33a32df2a8..407fa1df470e 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -62,6 +62,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
unsigned long addr, unsigned long end)
{
unsigned long next;
+ int result;
for (; addr < end; addr = next) {
p4d_t *p4d = p4d_page + p4d_index(addr);
@@ -73,13 +74,20 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
if (p4d_present(*p4d)) {
pud = pud_offset(p4d, 0);
- ident_pud_init(info, pud, addr, next);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+
continue;
}
pud = (pud_t *)info->alloc_pgt_page(info->context);
if (!pud)
return -ENOMEM;
- ident_pud_init(info, pud, addr, next);
+
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+
set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 32bb38f6fc18..8039a951db8f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -112,8 +112,6 @@ __ref void *alloc_low_pages(unsigned int num)
} else {
pfn = pgt_buf_end;
pgt_buf_end += num;
- printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
- pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
}
for (i = 0; i < num; i++) {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 624edfbff02d..8c02da40931a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1282,18 +1282,18 @@ int kern_addr_valid(unsigned long addr)
return 0;
p4d = p4d_offset(pgd, addr);
- if (p4d_none(*p4d))
+ if (!p4d_present(*p4d))
return 0;
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+ if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
return pfn_valid(pud_pfn(*pud));
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 7bebdd0273d3..3faf9667cc40 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -626,7 +626,7 @@ bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
return arch_memremap_can_ram_remap(phys_addr, size, 0);
}
-#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+#ifdef CONFIG_AMD_MEM_ENCRYPT
/* Remap memory with encryption */
void __init *early_memremap_encrypted(resource_size_t phys_addr,
unsigned long size)
@@ -668,7 +668,7 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
}
-#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 48c03c74c7f4..f3ec45afce97 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -248,8 +248,8 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
-#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
- (_PAGE_PAT | _PAGE_PWT))
+#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
+ (_PAGE_PAT_LARGE | _PAGE_PWT))
#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
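Background for the hunk above (restating page-table layout facts, not patch content): in a 4 KB PTE the PAT attribute lives in bit 7, but in a 2 MB/1 GB entry bit 7 is the PS bit and PAT moves to bit 12, so a write-protect cache attribute applied to PMD-sized mappings has to use the *_LARGE variants. The bit positions, as defined in pgtable_types.h and shown here for reference:

#define _PAGE_BIT_PAT		7	/* PAT bit in a 4 KB PTE        */
#define _PAGE_BIT_PAT_LARGE	12	/* PAT bit in a 2 MB/1 GB entry */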
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 4d434ddb75db..f140b2d39319 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -385,7 +385,7 @@ static void enter_uniprocessor(void)
int cpu;
int err;
- if (downed_cpus == NULL &&
+ if (!cpumask_available(downed_cpus) &&
!alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
pr_notice("Failed to allocate mask\n");
goto out;
@@ -415,7 +415,7 @@ static void leave_uniprocessor(void)
int cpu;
int err;
- if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
+ if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0)
return;
pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 55338b392221..84e5c5bdaa74 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -765,6 +765,8 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
}
free_page((unsigned long)pmd_sv);
+
+ pgtable_pmd_page_dtor(virt_to_page(pmd));
free_page((unsigned long)pmd);
return 1;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index cdb386fa7101..a114c319cac2 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -153,6 +153,19 @@ static bool is_ereg(u32 reg)
BIT(BPF_REG_AX));
}
+/*
+ * is_ereg_8l() == true if BPF register 'reg' is mapped to access x86-64
+ * lower 8-bit registers dil,sil,bpl,spl,r8b..r15b, which need an extra
+ * byte of encoding. al,cl,dl,bl have simpler encoding.
+ */
+static bool is_ereg_8l(u32 reg)
+{
+ return is_ereg(reg) ||
+ (1 << reg) & (BIT(BPF_REG_1) |
+ BIT(BPF_REG_2) |
+ BIT(BPF_REG_FP));
+}
+
/* add modifiers if 'reg' maps to x64 registers r8..r15 */
static u8 add_1mod(u8 byte, u32 reg)
{
@@ -770,9 +783,8 @@ st: if (is_imm8(insn->off))
/* STX: *(u8*)(dst_reg + off) = src_reg */
case BPF_STX | BPF_MEM | BPF_B:
/* emit 'mov byte ptr [rax + off], al' */
- if (is_ereg(dst_reg) || is_ereg(src_reg) ||
- /* have to add extra byte for x86 SIL, DIL regs */
- src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
+ if (is_ereg(dst_reg) || is_ereg_8l(src_reg))
+ /* Add extra byte for eregs or SIL,DIL,BPL in src_reg */
EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
else
EMIT1(0x88);
@@ -1095,7 +1107,16 @@ common_load:
}
if (image) {
- if (unlikely(proglen + ilen > oldproglen)) {
+ /*
+ * When populating the image, assert that:
+ *
+ * i) We do not write beyond the allocated space, and
+ * ii) addrs[i] did not change from the prior run, in order
+ * to validate assumptions made for computing branch
+ * displacements.
+ */
+ if (unlikely(proglen + ilen > oldproglen ||
+ proglen + ilen != addrs[i])) {
pr_err("bpf_jit: fatal error\n");
return -EFAULT;
}
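For context on the is_ereg_8l() addition above (an illustration, not part of the patch): BPF R1, R2 and FP map to rdi, rsi and rbp in this JIT, and their low-byte forms (dil, sil, bpl) are only encodable with a REX prefix, unlike al/cl/dl/bl. A minimal standalone sketch of that encoding rule, using hardware register numbers rather than BPF register IDs:

#include <stdbool.h>

/*
 * Byte stores from x86-64 register numbers 0-3 (al, cl, dl, bl) encode as a
 * plain "88 /r"; numbers 4-7 (spl, bpl, sil, dil) need an empty REX prefix
 * (0x40), and 8-15 (r8b..r15b) need REX.R/REX.B anyway. For example:
 *   mov byte ptr [rax], al  -> 88 00
 *   mov byte ptr [rax], dil -> 40 88 38
 */
static bool byte_store_needs_rex(unsigned int hwreg)
{
	return hwreg >= 4;
}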
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 33e9b4f1ce20..c177da94fc79 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -572,6 +572,10 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ec, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ed, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26c, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar);
/*
* Device [1022:7808]
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index cadd7fd290fa..2a4ad0606fa3 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -276,7 +276,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
- new = early_memremap(new_phys, new_size);
+ new = early_memremap_prot(new_phys, new_size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 03fc397335b7..c9fc725a1dcf 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -171,9 +171,10 @@ static struct irq_domain *uv_get_irq_domain(void)
goto out;
uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL);
- irq_domain_free_fwnode(fn);
if (uv_domain)
uv_domain->parent = x86_vector_domain;
+ else
+ irq_domain_free_fwnode(fn);
out:
mutex_unlock(&uv_lock);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index ed84d3917a59..76740c8c000c 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -57,6 +57,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -114,8 +115,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif
}
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
index fd1ab80be0de..a4cf678cf5c8 100644
--- a/arch/x86/tools/chkobjdump.awk
+++ b/arch/x86/tools/chkobjdump.awk
@@ -10,6 +10,7 @@ BEGIN {
/^GNU objdump/ {
verstr = ""
+ gsub(/\(.*\)/, "");
for (i = 3; i <= NF; i++)
if (match($(i), "^[0-9]")) {
verstr = $(i);
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 220e97841e49..c58b63178123 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -840,9 +840,11 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
@@ -883,9 +885,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 1f8175bf2a5e..88d084a57b14 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -721,8 +721,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
preempt_enable();
}
-static void xen_convert_trap_info(const struct desc_ptr *desc,
- struct trap_info *traps)
+static unsigned xen_convert_trap_info(const struct desc_ptr *desc,
+ struct trap_info *traps, bool full)
{
unsigned in, out, count;
@@ -732,17 +732,18 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
for (in = out = 0; in < count; in++) {
gate_desc *entry = (gate_desc *)(desc->address) + in;
- if (cvt_gate_to_trap(in, entry, &traps[out]))
+ if (cvt_gate_to_trap(in, entry, &traps[out]) || full)
out++;
}
- traps[out].address = 0;
+
+ return out;
}
void xen_copy_trap_info(struct trap_info *traps)
{
const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
- xen_convert_trap_info(desc, traps);
+ xen_convert_trap_info(desc, traps, true);
}
/* Load a new IDT into Xen. In principle this can be per-CPU, so we
@@ -752,6 +753,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
{
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
+ unsigned out;
trace_xen_cpu_load_idt(desc);
@@ -759,7 +761,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
- xen_convert_trap_info(desc, traps);
+ out = xen_convert_trap_info(desc, traps, false);
+ memset(&traps[out], 0, sizeof(traps[0]));
xen_mc_flush();
if (HYPERVISOR_set_trap_table(traps))
@@ -1214,6 +1217,11 @@ static void __init xen_dom0_set_legacy_features(void)
x86_platform.legacy.rtc = 1;
}
+static void __init xen_domu_set_legacy_features(void)
+{
+ x86_platform.legacy.rtc = 0;
+}
+
/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
@@ -1375,6 +1383,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
add_preferred_console("hvc", 0, NULL);
if (pci_xen)
x86_init.pci.arch_init = pci_xen_init;
+ x86_platform.set_legacy_features =
+ xen_domu_set_legacy_features;
} else {
const struct dom0_vga_console_info *info =
(void *)((char *)xen_start_info +
@@ -1404,6 +1414,15 @@ asmlinkage __visible void __init xen_start_kernel(void)
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
xen_boot_params_init_edd();
+
+#ifdef CONFIG_ACPI
+ /*
+ * Disable selecting "Firmware First mode" for correctable
+ * memory errors, as it is the hypervisor's duty to make
+ * that decision.
+ */
+ acpi_disable_cmcff = 1;
+#endif
}
#ifdef CONFIG_PCI
/* PCI BIOS service won't work from a PV guest. */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 15812e553b95..0d83f25ac8ac 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -613,8 +613,8 @@ int xen_alloc_p2m_entry(unsigned long pfn)
}
/* Expanded the p2m? */
- if (pfn > xen_p2m_last_pfn) {
- xen_p2m_last_pfn = pfn;
+ if (pfn >= xen_p2m_last_pfn) {
+ xen_p2m_last_pfn = ALIGN(pfn + 1, P2M_PER_PAGE);
HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
}
@@ -706,9 +706,12 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
for (i = 0; i < count; i++) {
unsigned long mfn, pfn;
+ struct gnttab_unmap_grant_ref unmap[2];
+ int rc;
/* Do not add to override if the map failed. */
- if (map_ops[i].status)
+ if (map_ops[i].status != GNTST_okay ||
+ (kmap_ops && kmap_ops[i].status != GNTST_okay))
continue;
if (map_ops[i].flags & GNTMAP_contains_pte) {
@@ -722,10 +725,46 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned");
- if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
- ret = -ENOMEM;
- goto out;
+ if (likely(set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
+ continue;
+
+ /*
+ * Signal an error for this slot. This in turn requires
+ * immediate unmapping.
+ */
+ map_ops[i].status = GNTST_general_error;
+ unmap[0].host_addr = map_ops[i].host_addr;
+ unmap[0].handle = map_ops[i].handle;
+ map_ops[i].handle = ~0;
+ if (map_ops[i].flags & GNTMAP_device_map)
+ unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr;
+ else
+ unmap[0].dev_bus_addr = 0;
+
+ if (kmap_ops) {
+ kmap_ops[i].status = GNTST_general_error;
+ unmap[1].host_addr = kmap_ops[i].host_addr;
+ unmap[1].handle = kmap_ops[i].handle;
+ kmap_ops[i].handle = ~0;
+ if (kmap_ops[i].flags & GNTMAP_device_map)
+ unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr;
+ else
+ unmap[1].dev_bus_addr = 0;
}
+
+ /*
+ * Pre-populate both status fields, to be recognizable in
+ * the log message below.
+ */
+ unmap[0].status = 1;
+ unmap[1].status = 1;
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ unmap, 1 + !!kmap_ops);
+ if (rc || unmap[0].status != GNTST_okay ||
+ unmap[1].status != GNTST_okay)
+ pr_err_once("gnttab unmap failed: rc=%d st0=%d st1=%d\n",
+ rc, unmap[0].status, unmap[1].status);
}
out:
@@ -746,17 +785,15 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
unsigned long pfn = page_to_pfn(pages[i]);
- if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+ if (mfn != INVALID_P2M_ENTRY && (mfn & FOREIGN_FRAME_BIT))
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ else
ret = -EINVAL;
- goto out;
- }
-
- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
if (kunmap_ops)
ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
- kunmap_ops, count);
-out:
+ kunmap_ops, count) ?: ret;
+
return ret;
}
EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index db6d90e451de..f779d2a5b04c 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -89,6 +89,7 @@ asmlinkage __visible void cpu_bringup_and_idle(void)
{
cpu_bringup();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+ prevent_tail_call_optimization();
}
void xen_smp_intr_free_pv(unsigned int cpu)
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 2527540051ff..e22ee2439615 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -99,10 +99,20 @@ void xen_init_lock_cpu(int cpu)
void xen_uninit_lock_cpu(int cpu)
{
+ int irq;
+
if (!xen_pvspin)
return;
- unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+ /*
+ * When booting the kernel with 'mitigations=auto,nosmt', the secondary
+ * CPUs are not activated, and lock_kicker_irq is not initialized.
+ */
+ irq = per_cpu(lock_kicker_irq, cpu);
+ if (irq == -1)
+ return;
+
+ unbind_from_irqhandler(irq, NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
kfree(per_cpu(irq_name, cpu));
per_cpu(irq_name, cpu) = NULL;