aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/crypto/blake2s-glue.c10
-rw-r--r--arch/x86/crypto/chacha_glue.c14
-rw-r--r--arch/x86/crypto/nhpoly1305-avx2-glue.c2
-rw-r--r--arch/x86/crypto/nhpoly1305-sse2-glue.c2
-rw-r--r--arch/x86/crypto/poly1305_glue.c13
-rw-r--r--arch/x86/entry/calling.h40
-rw-r--r--arch/x86/entry/entry_64.S14
-rw-r--r--arch/x86/events/intel/core.c4
-rw-r--r--arch/x86/hyperv/hv_init.c31
-rw-r--r--arch/x86/include/asm/cpu_device_id.h30
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/dma.h2
-rw-r--r--arch/x86/include/asm/ftrace.h6
-rw-r--r--arch/x86/include/asm/io_bitmap.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h5
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/pgtable.h1
-rw-r--r--arch/x86/include/asm/set_memory.h19
-rw-r--r--arch/x86/include/asm/stackprotector.h7
-rw-r--r--arch/x86/include/asm/unwind.h2
-rw-r--r--arch/x86/include/uapi/asm/unistd.h11
-rw-r--r--arch/x86/kernel/apic/apic.c27
-rw-r--r--arch/x86/kernel/cpu/amd.c3
-rw-r--r--arch/x86/kernel/cpu/bugs.c198
-rw-r--r--arch/x86/kernel/cpu/common.c56
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/match.c7
-rw-r--r--arch/x86/kernel/cpu/mce/core.c11
-rw-r--r--arch/x86/kernel/fpu/xstate.c86
-rw-r--r--arch/x86/kernel/ftrace.c29
-rw-r--r--arch/x86/kernel/ioport.c22
-rw-r--r--arch/x86/kernel/process.c32
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/smpboot.c8
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/unwind_orc.c78
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kvm/ioapic.c10
-rw-r--r--arch/x86/kvm/mmu/mmu.c46
-rw-r--r--arch/x86/kvm/svm.c8
-rw-r--r--arch/x86/kvm/vmx/nested.c23
-rw-r--r--arch/x86/kvm/vmx/ops.h1
-rw-r--r--arch/x86/kvm/vmx/vmenter.S3
-rw-r--r--arch/x86/kvm/vmx/vmx.c36
-rw-r--r--arch/x86/kvm/vmx/vmx.h3
-rw-r--r--arch/x86/kvm/x86.c30
-rw-r--r--arch/x86/mm/dump_pagetables.c33
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--arch/x86/mm/mmio-mod.c4
-rw-r--r--arch/x86/mm/pat/set_memory.c12
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/xen/smp_pv.c1
53 files changed, 698 insertions, 317 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index beea77046f9b..0bc9a74468be 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -70,6 +70,7 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
+ select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index 06ef2d4a4701..6737bcea1fa1 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2s_state *state,
const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
+ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
- for (;;) {
+ do {
const size_t blocks = min_t(size_t, nblocks,
- PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
+ SZ_4K / BLAKE2S_BLOCK_SIZE);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
@@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2s_state *state,
kernel_fpu_end();
nblocks -= blocks;
- if (!nblocks)
- break;
block += blocks * BLAKE2S_BLOCK_SIZE;
- }
+ } while (nblocks);
}
EXPORT_SYMBOL(blake2s_compress_arch);
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 68a74953efaf..ebf2cd7ff2f0 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -154,9 +154,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
bytes <= CHACHA_BLOCK_SIZE)
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
- kernel_fpu_begin();
- chacha_dosimd(state, dst, src, bytes, nrounds);
- kernel_fpu_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_fpu_begin();
+ chacha_dosimd(state, dst, src, todo, nrounds);
+ kernel_fpu_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index f7567cbd35b6..80fcb85736e1 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -29,7 +29,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index a661ede3b5cf..cc6b7c1a2705 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -29,7 +29,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 79bb58737d52..61b2bc8b6986 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
struct poly1305_arch_internal *state = ctx;
/* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
- PAGE_SIZE % POLY1305_BLOCK_SIZE);
+ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+ SZ_4K % POLY1305_BLOCK_SIZE);
if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
(len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
@@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
return;
}
- for (;;) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+ do {
+ const size_t bytes = min_t(size_t, len, SZ_4K);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
@@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
else
poly1305_blocks_avx(ctx, inp, bytes, padbit);
kernel_fpu_end();
+
len -= bytes;
- if (!len)
- break;
inp += bytes;
- }
+ } while (len);
}
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 0789e13ece90..1c7f13bb6728 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -98,13 +98,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
- /*
- * Push registers and sanitize registers of values that a
- * speculation attack might otherwise want to exploit. The
- * lower registers are likely clobbered well before they
- * could be put to use in a speculative execution gadget.
- * Interleave XOR with PUSH for better uop scheduling:
- */
.if \save_ret
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
@@ -114,34 +107,43 @@ For 32-bit we have the following conventions - kernel is built with
pushq %rsi /* pt_regs->si */
.endif
pushq \rdx /* pt_regs->dx */
- xorl %edx, %edx /* nospec dx */
pushq %rcx /* pt_regs->cx */
- xorl %ecx, %ecx /* nospec cx */
pushq \rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
- xorl %r8d, %r8d /* nospec r8 */
pushq %r9 /* pt_regs->r9 */
- xorl %r9d, %r9d /* nospec r9 */
pushq %r10 /* pt_regs->r10 */
- xorl %r10d, %r10d /* nospec r10 */
pushq %r11 /* pt_regs->r11 */
- xorl %r11d, %r11d /* nospec r11*/
pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx*/
pushq %rbp /* pt_regs->rbp */
- xorl %ebp, %ebp /* nospec rbp*/
pushq %r12 /* pt_regs->r12 */
- xorl %r12d, %r12d /* nospec r12*/
pushq %r13 /* pt_regs->r13 */
- xorl %r13d, %r13d /* nospec r13*/
pushq %r14 /* pt_regs->r14 */
- xorl %r14d, %r14d /* nospec r14*/
pushq %r15 /* pt_regs->r15 */
- xorl %r15d, %r15d /* nospec r15*/
UNWIND_HINT_REGS
+
.if \save_ret
pushq %rsi /* return address on top of stack */
.endif
+
+ /*
+ * Sanitize registers of values that a speculation attack might
+ * otherwise want to exploit. The lower registers are likely clobbered
+ * well before they could be put to use in a speculative execution
+ * gadget.
+ */
+ xorl %edx, %edx /* nospec dx */
+ xorl %ecx, %ecx /* nospec cx */
+ xorl %r8d, %r8d /* nospec r8 */
+ xorl %r9d, %r9d /* nospec r9 */
+ xorl %r10d, %r10d /* nospec r10 */
+ xorl %r11d, %r11d /* nospec r11 */
+ xorl %ebx, %ebx /* nospec rbx */
+ xorl %ebp, %ebp /* nospec rbp */
+ xorl %r12d, %r12d /* nospec r12 */
+ xorl %r13d, %r13d /* nospec r13 */
+ xorl %r14d, %r14d /* nospec r14 */
+ xorl %r15d, %r15d /* nospec r15 */
+
.endm
.macro POP_REGS pop_rdi=1 skip_r11rcx=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f2bb91e87877..faa53fee0663 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -249,7 +249,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
- UNWIND_HINT_EMPTY
POP_REGS pop_rdi=0 skip_r11rcx=1
/*
@@ -258,6 +257,7 @@ syscall_return_via_sysret:
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
@@ -279,8 +279,7 @@ SYM_CODE_END(entry_SYSCALL_64)
* %rdi: prev task
* %rsi: next task
*/
-SYM_CODE_START(__switch_to_asm)
- UNWIND_HINT_FUNC
+SYM_FUNC_START(__switch_to_asm)
/*
* Save callee-saved registers
* This must match the order in inactive_task_frame
@@ -321,7 +320,7 @@ SYM_CODE_START(__switch_to_asm)
popq %rbp
jmp __switch_to
-SYM_CODE_END(__switch_to_asm)
+SYM_FUNC_END(__switch_to_asm)
/*
* A newly forked process directly context switches into this address.
@@ -512,7 +511,7 @@ SYM_CODE_END(spurious_entries_start)
* +----------------------------------------------------+
*/
SYM_CODE_START(interrupt_entry)
- UNWIND_HINT_FUNC
+ UNWIND_HINT_IRET_REGS offset=16
ASM_CLAC
cld
@@ -544,9 +543,9 @@ SYM_CODE_START(interrupt_entry)
pushq 5*8(%rdi) /* regs->eflags */
pushq 4*8(%rdi) /* regs->cs */
pushq 3*8(%rdi) /* regs->ip */
+ UNWIND_HINT_IRET_REGS
pushq 2*8(%rdi) /* regs->orig_ax */
pushq 8(%rdi) /* return address */
- UNWIND_HINT_FUNC
movq (%rdi), %rdi
jmp 2f
@@ -637,6 +636,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
@@ -1739,7 +1739,7 @@ SYM_CODE_START(rewind_stack_do_exit)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
leaq -PTREGS_SIZE(%rax), %rsp
- UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
+ UNWIND_HINT_REGS
call do_exit
SYM_CODE_END(rewind_stack_do_exit)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index dff6623804c2..dae71ebfa709 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1892,8 +1892,8 @@ static __initconst const u64 tnt_hw_cache_extra_regs
static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
- INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0),
- INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1),
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1),
EVENT_EXTRA_END
};
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 624f5d9b0f79..acf76b466db6 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -73,7 +73,8 @@ static int hv_cpu_init(unsigned int cpu)
struct page *pg;
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- pg = alloc_page(GFP_KERNEL);
+ /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
+ pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL);
if (unlikely(!pg))
return -ENOMEM;
*input_arg = page_address(pg);
@@ -225,10 +226,18 @@ static int hv_cpu_die(unsigned int cpu)
rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
if (re_ctrl.target_vp == hv_vp_index[cpu]) {
- /* Reassign to some other online CPU */
+ /*
+ * Reassign reenlightenment notifications to some other online
+ * CPU or just disable the feature if there are no online CPUs
+ * left (happens on hibernation).
+ */
new_cpu = cpumask_any_but(cpu_online_mask, cpu);
- re_ctrl.target_vp = hv_vp_index[new_cpu];
+ if (new_cpu < nr_cpu_ids)
+ re_ctrl.target_vp = hv_vp_index[new_cpu];
+ else
+ re_ctrl.enabled = 0;
+
wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
}
@@ -254,6 +263,7 @@ static int __init hv_pci_init(void)
static int hv_suspend(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
+ int ret;
/*
* Reset the hypercall page as it is going to be invalidated
@@ -270,12 +280,17 @@ static int hv_suspend(void)
hypercall_msr.enable = 0;
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
- return 0;
+ ret = hv_cpu_die(0);
+ return ret;
}
static void hv_resume(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
+ int ret;
+
+ ret = hv_cpu_init(0);
+ WARN_ON(ret);
/* Re-enable the hypercall page */
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -286,8 +301,16 @@ static void hv_resume(void)
hv_hypercall_pg = hv_hypercall_pg_saved;
hv_hypercall_pg_saved = NULL;
+
+ /*
+ * Reenlightenment notifications are disabled by hv_cpu_die(0),
+ * reenable them here if hv_reenlightenment_cb was previously set.
+ */
+ if (hv_reenlightenment_cb)
+ set_hv_tscchange_cb(hv_reenlightenment_cb);
}
+/* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */
static struct syscore_ops hv_syscore_ops = {
.suspend = hv_suspend,
.resume = hv_resume,
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index 31c379c1da41..0c814cd9ea42 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -9,6 +9,36 @@
#include <linux/mod_devicetable.h>
+#define X86_CENTAUR_FAM6_C7_D 0xd
+#define X86_CENTAUR_FAM6_NANO 0xf
+
+#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
+
+/**
+ * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
+ * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+ * The name is expanded to X86_VENDOR_@_vendor
+ * @_family: The family number or X86_FAMILY_ANY
+ * @_model: The model number, model constant or X86_MODEL_ANY
+ * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY
+ * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY
+ * @_data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is casted to unsigned long internally.
+ *
+ * Backport version to keep the SRBDS pile consistant. No shorter variants
+ * required for this.
+ */
+#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
+ _steppings, _feature, _data) { \
+ .vendor = X86_VENDOR_##_vendor, \
+ .family = _family, \
+ .model = _model, \
+ .steppings = _steppings, \
+ .feature = _feature, \
+ .driver_data = (unsigned long) _data \
+}
+
/*
* Match specific microcode revisions.
*
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f3327cb56edf..69f7dcb1fa5c 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -360,6 +360,7 @@
#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */
#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */
+#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
@@ -404,5 +405,6 @@
#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
+#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 00f7cf45e699..8e95aa4b0d17 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -74,7 +74,7 @@
#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT)
/* 4GB broken PCI/AGP hardware bus master zone */
-#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT))
#ifdef CONFIG_X86_32
/* The maximum address that we can perform a DMA transfer to on this platform */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 85be2f506272..89af0d2c62aa 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -56,6 +56,12 @@ struct dyn_arch_ftrace {
#ifndef __ASSEMBLY__
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+extern void set_ftrace_ops_ro(void);
+#else
+static inline void set_ftrace_ops_ro(void) { }
+#endif
+
#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h
index 07344d82e88e..ac1a99ffbd8d 100644
--- a/arch/x86/include/asm/io_bitmap.h
+++ b/arch/x86/include/asm/io_bitmap.h
@@ -17,7 +17,7 @@ struct task_struct;
#ifdef CONFIG_X86_IOPL_IOPERM
void io_bitmap_share(struct task_struct *tsk);
-void io_bitmap_exit(void);
+void io_bitmap_exit(struct task_struct *tsk);
void native_tss_update_io_bitmap(void);
@@ -29,7 +29,7 @@ void native_tss_update_io_bitmap(void);
#else
static inline void io_bitmap_share(struct task_struct *tsk) { }
-static inline void io_bitmap_exit(void) { }
+static inline void io_bitmap_exit(struct task_struct *tsk) { }
static inline void tss_update_io_bitmap(void) { }
#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d79b40cd8283..c121b8f24597 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -574,6 +574,7 @@ struct kvm_vcpu_arch {
unsigned long cr4;
unsigned long cr4_guest_owned_bits;
unsigned long cr8;
+ u32 host_pkru;
u32 pkru;
u32 hflags;
u64 efer;
@@ -1664,8 +1665,8 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
{
/* We can only post Fixed and LowPrio IRQs */
- return (irq->delivery_mode == dest_Fixed ||
- irq->delivery_mode == dest_LowestPrio);
+ return (irq->delivery_mode == APIC_DM_FIXED ||
+ irq->delivery_mode == APIC_DM_LOWEST);
}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d5e517d1c3dd..af64c8e80ff4 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -119,6 +119,10 @@
#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
+/* SRBDS support */
+#define MSR_IA32_MCU_OPT_CTRL 0x00000123
+#define RNGDS_MITG_DIS BIT(0)
+
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
#define MSR_IA32_SYSENTER_EIP 0x00000176
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 64a03f226ab7..dca64a2dda9c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -256,6 +256,7 @@ static inline int pmd_large(pmd_t pte)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */
static inline int pmd_trans_huge(pmd_t pmd)
{
return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 64c3dce374e5..688d6f1e7a63 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -83,28 +83,35 @@ int set_direct_map_default_noflush(struct page *page);
extern int kernel_set_to_readonly;
#ifdef CONFIG_X86_64
-static inline int set_mce_nospec(unsigned long pfn)
+/*
+ * Prevent speculative access to the page by either unmapping
+ * it (if we do not require access to any part of the page) or
+ * marking it uncacheable (if we want to try to retrieve data
+ * from non-poisoned lines in the page).
+ */
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{
unsigned long decoy_addr;
int rc;
/*
- * Mark the linear address as UC to make sure we don't log more
- * errors because of speculative access to the page.
* We would like to just call:
- * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+ * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
* but doing that would radically increase the odds of a
* speculative access to the poison page because we'd have
* the virtual address of the kernel 1:1 mapping sitting
* around in registers.
* Instead we get tricky. We create a non-canonical address
* that looks just like the one we want, but has bit 63 flipped.
- * This relies on set_memory_uc() properly sanitizing any __pa()
+ * This relies on set_memory_XX() properly sanitizing any __pa()
* results with __PHYSICAL_MASK or PTE_PFN_MASK.
*/
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
- rc = set_memory_uc(decoy_addr, 1);
+ if (unmap)
+ rc = set_memory_np(decoy_addr, 1);
+ else
+ rc = set_memory_uc(decoy_addr, 1);
if (rc)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 91e29b6a86a5..9804a7957f4e 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -55,8 +55,13 @@
/*
* Initialize the stackprotector canary value.
*
- * NOTE: this must only be called from functions that never return,
+ * NOTE: this must only be called from functions that never return
* and it must always be inlined.
+ *
+ * In addition, it should be called from a compilation unit for which
+ * stack protector is disabled. Alternatively, the caller should not end
+ * with a function call which gets tail-call optimized as that would
+ * lead to checking a modified canary value.
*/
static __always_inline void boot_init_stack_canary(void)
{
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 499578f7e6d7..70fc159ebe69 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -19,7 +19,7 @@ struct unwind_state {
#if defined(CONFIG_UNWINDER_ORC)
bool signal, full_regs;
unsigned long sp, bp, ip;
- struct pt_regs *regs;
+ struct pt_regs *regs, *prev_regs;
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h
index 196fdd02b8b1..be5e2e747f50 100644
--- a/arch/x86/include/uapi/asm/unistd.h
+++ b/arch/x86/include/uapi/asm/unistd.h
@@ -2,8 +2,15 @@
#ifndef _UAPI_ASM_X86_UNISTD_H
#define _UAPI_ASM_X86_UNISTD_H
-/* x32 syscall flag bit */
-#define __X32_SYSCALL_BIT 0x40000000UL
+/*
+ * x32 syscall flag bit. Some user programs expect syscall NR macros
+ * and __X32_SYSCALL_BIT to have type int, even though syscall numbers
+ * are, for practical purposes, unsigned long.
+ *
+ * Fortunately, expressions like (nr & ~__X32_SYSCALL_BIT) do the right
+ * thing regardless.
+ */
+#define __X32_SYSCALL_BIT 0x40000000
#ifndef __KERNEL__
# ifdef __i386__
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 5f973fed3c9f..e289722b04f6 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -352,8 +352,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
* According to Intel, MFENCE can do the serialization here.
*/
asm volatile("mfence" : : : "memory");
-
- printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}
@@ -552,7 +550,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
#define DEADLINE_MODEL_MATCH_REV(model, rev) \
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev }
-static u32 hsx_deadline_rev(void)
+static __init u32 hsx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x3a; /* EP */
@@ -562,7 +560,7 @@ static u32 hsx_deadline_rev(void)
return ~0U;
}
-static u32 bdx_deadline_rev(void)
+static __init u32 bdx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x00000011;
@@ -574,7 +572,7 @@ static u32 bdx_deadline_rev(void)
return ~0U;
}
-static u32 skx_deadline_rev(void)
+static __init u32 skx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x03: return 0x01000136;
@@ -587,7 +585,7 @@ static u32 skx_deadline_rev(void)
return ~0U;
}
-static const struct x86_cpu_id deadline_match[] = {
+static const struct x86_cpu_id deadline_match[] __initconst = {
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_D, bdx_deadline_rev),
@@ -609,18 +607,19 @@ static const struct x86_cpu_id deadline_match[] = {
{},
};
-static void apic_check_deadline_errata(void)
+static __init bool apic_validate_deadline_timer(void)
{
const struct x86_cpu_id *m;
u32 rev;
- if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
- boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
m = x86_match_cpu(deadline_match);
if (!m)
- return;
+ return true;
/*
* Function pointers will have the MSB set due to address layout,
@@ -632,11 +631,12 @@ static void apic_check_deadline_errata(void)
rev = (u32)m->driver_data;
if (boot_cpu_data.microcode >= rev)
- return;
+ return true;
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
"please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
}
/*
@@ -2098,7 +2098,8 @@ void __init init_apic_mappings(void)
{
unsigned int new_apicid;
- apic_check_deadline_errata();
+ if (apic_validate_deadline_timer())
+ pr_debug("TSC deadline timer available\n");
if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1f875fbe1384..f04cc01e629e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1111,8 +1111,7 @@ static const int amd_erratum_383[] =
/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
static const int amd_erratum_1054[] =
- AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
-
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ed54b3b21c39..b53dcff21438 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
static void __init mds_print_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init srbds_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -108,6 +109,7 @@ void __init check_bugs(void)
l1tf_select_mitigation();
mds_select_mitigation();
taa_select_mitigation();
+ srbds_select_mitigation();
/*
* As MDS and TAA mitigations are inter-related, print MDS
@@ -398,6 +400,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str)
early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "SRBDS: " fmt
+
+enum srbds_mitigations {
+ SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_UCODE_NEEDED,
+ SRBDS_MITIGATION_FULL,
+ SRBDS_MITIGATION_TSX_OFF,
+ SRBDS_MITIGATION_HYPERVISOR,
+};
+
+static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+
+static const char * const srbds_strings[] = {
+ [SRBDS_MITIGATION_OFF] = "Vulnerable",
+ [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled",
+ [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+static bool srbds_off;
+
+void update_srbds_msr(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ switch (srbds_mitigation) {
+ case SRBDS_MITIGATION_OFF:
+ case SRBDS_MITIGATION_TSX_OFF:
+ mcu_ctrl |= RNGDS_MITG_DIS;
+ break;
+ case SRBDS_MITIGATION_FULL:
+ mcu_ctrl &= ~RNGDS_MITG_DIS;
+ break;
+ default:
+ break;
+ }
+
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+}
+
+static void __init srbds_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ /*
+ * Check to see if this is one of the MDS_NO systems supporting
+ * TSX that are only exposed to SRBDS when TSX is enabled.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM))
+ srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
+ else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
+ else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
+ else if (cpu_mitigations_off() || srbds_off)
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+
+ update_srbds_msr();
+ pr_info("%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static int __init srbds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return 0;
+
+ srbds_off = !strcmp(str, "off");
+ return 0;
+}
+early_param("srbds", srbds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -495,7 +588,9 @@ early_param("nospectre_v1", nospectre_v1_cmdline);
static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
SPECTRE_V2_NONE;
-static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
+static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
SPECTRE_V2_USER_NONE;
#ifdef CONFIG_RETPOLINE
@@ -641,15 +736,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
break;
}
- /*
- * At this point, an STIBP mode other than "off" has been set.
- * If STIBP support is not being forced, check if STIBP always-on
- * is preferred.
- */
- if (mode != SPECTRE_V2_USER_STRICT &&
- boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
- mode = SPECTRE_V2_USER_STRICT_PREFERRED;
-
/* Initialize Indirect Branch Prediction Barrier */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
@@ -672,23 +758,36 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
static_key_enabled(&switch_mm_always_ibpb) ?
"always-on" : "conditional");
+
+ spectre_v2_user_ibpb = mode;
}
- /* If enhanced IBRS is enabled no STIBP required */
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ /*
+ * If enhanced IBRS is enabled or SMT impossible, STIBP is not
+ * required.
+ */
+ if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return;
/*
- * If SMT is not possible or STIBP is not available clear the STIBP
- * mode.
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if (mode != SPECTRE_V2_USER_STRICT &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+ /*
+ * If STIBP is not available, clear the STIBP mode.
*/
- if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
mode = SPECTRE_V2_USER_NONE;
+
+ spectre_v2_user_stibp = mode;
+
set_mode:
- spectre_v2_user = mode;
- /* Only print the STIBP mode when SMT possible */
- if (smt_possible)
- pr_info("%s\n", spectre_v2_user_strings[mode]);
+ pr_info("%s\n", spectre_v2_user_strings[mode]);
}
static const char * const spectre_v2_strings[] = {
@@ -921,7 +1020,7 @@ void cpu_bugs_smt_update(void)
{
mutex_lock(&spec_ctrl_mutex);
- switch (spectre_v2_user) {
+ switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
break;
case SPECTRE_V2_USER_STRICT:
@@ -1164,14 +1263,19 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
case PR_SPEC_ENABLE:
- if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return 0;
/*
* Indirect branch speculation is always disabled in strict
- * mode.
+ * mode. It can neither be enabled if it was force-disabled
+ * by a previous prctl call.
+
*/
- if (spectre_v2_user == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ task_spec_ib_force_disable(task))
return -EPERM;
task_clear_spec_ib_disable(task);
task_update_spec_tif(task);
@@ -1182,10 +1286,12 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
* Indirect branch speculation is always allowed when
* mitigation is force disabled.
*/
- if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return -EPERM;
- if (spectre_v2_user == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
return 0;
task_set_spec_ib_disable(task);
if (ctrl == PR_SPEC_FORCE_DISABLE)
@@ -1216,7 +1322,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
{
if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
- if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP)
ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
}
#endif
@@ -1247,22 +1354,24 @@ static int ib_prctl_get(struct task_struct *task)
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
return PR_SPEC_NOT_AFFECTED;
- switch (spectre_v2_user) {
- case SPECTRE_V2_USER_NONE:
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return PR_SPEC_ENABLE;
- case SPECTRE_V2_USER_PRCTL:
- case SPECTRE_V2_USER_SECCOMP:
+ else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) {
if (task_spec_ib_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ib_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- case SPECTRE_V2_USER_STRICT:
- case SPECTRE_V2_USER_STRICT_PREFERRED:
- return PR_SPEC_DISABLE;
- default:
+ } else
return PR_SPEC_NOT_AFFECTED;
- }
}
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
@@ -1501,7 +1610,7 @@ static char *stibp_state(void)
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return "";
- switch (spectre_v2_user) {
+ switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
return ", STIBP: disabled";
case SPECTRE_V2_USER_STRICT:
@@ -1528,6 +1637,11 @@ static char *ibpb_state(void)
return "";
}
+static ssize_t srbds_show_state(char *buf)
+{
+ return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -1572,6 +1686,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_ITLB_MULTIHIT:
return itlb_multihit_show_state(buf);
+ case X86_BUG_SRBDS:
+ return srbds_show_state(buf);
+
default:
break;
}
@@ -1618,4 +1735,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr
{
return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
}
+
+ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4cdb123ff66a..0567448124e1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1075,9 +1075,30 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
{}
};
-static bool __init cpu_matches(unsigned long which)
+#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
+ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, steppings, \
+ X86_FEATURE_ANY, issues)
+
+#define SRBDS BIT(0)
+
+static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
+ VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS),
+ {}
+};
+
+static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which)
{
- const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
+ const struct x86_cpu_id *m = x86_match_cpu(table);
return m && !!(m->driver_data & which);
}
@@ -1097,31 +1118,34 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
u64 ia32_cap = x86_read_arch_cap_msr();
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
- if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
+ !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
- if (cpu_matches(NO_SPECULATION))
+ if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- if (!cpu_matches(NO_SPECTRE_V2))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2))
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
- if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
+ !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
if (ia32_cap & ARCH_CAP_IBRS_ALL)
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
- if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
+ !(ia32_cap & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
- if (cpu_matches(MSBDS_ONLY))
+ if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
}
- if (!cpu_matches(NO_SWAPGS))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS))
setup_force_cpu_bug(X86_BUG_SWAPGS);
/*
@@ -1139,7 +1163,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
setup_force_cpu_bug(X86_BUG_TAA);
- if (cpu_matches(NO_MELTDOWN))
+ /*
+ * SRBDS affects CPUs which support RDRAND or RDSEED and are listed
+ * in the vulnerability blacklist.
+ */
+ if ((cpu_has(c, X86_FEATURE_RDRAND) ||
+ cpu_has(c, X86_FEATURE_RDSEED)) &&
+ cpu_matches(cpu_vuln_blacklist, SRBDS))
+ setup_force_cpu_bug(X86_BUG_SRBDS);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
@@ -1148,7 +1181,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
- if (cpu_matches(NO_L1TF))
+ if (cpu_matches(cpu_vuln_whitelist, NO_L1TF))
return;
setup_force_cpu_bug(X86_BUG_L1TF);
@@ -1589,6 +1622,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
mtrr_ap_init();
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
+ update_srbds_msr();
}
static __init int setup_noclflush(char *arg)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 37fdefd14f28..fb538fccd24c 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -77,6 +77,7 @@ extern void detect_ht(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
+extern void update_srbds_msr(void);
extern u64 x86_read_arch_cap_msr(void);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 6dd78d8235e4..2f163e6646b6 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -34,13 +34,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
- for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
+ for (m = match;
+ m->vendor | m->family | m->model | m->steppings | m->feature;
+ m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
continue;
if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
continue;
+ if (m->steppings != X86_STEPPING_ANY &&
+ !(BIT(c->x86_stepping) & m->steppings))
+ continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
return m;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2c4f949611e4..410d3868bf33 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -527,6 +527,13 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -598,7 +605,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0))
- set_mce_nospec(pfn);
+ set_mce_nospec(pfn, whole_page(mce));
return NOTIFY_OK;
}
@@ -1096,7 +1103,7 @@ static int do_memory_failure(struct mce *m)
if (ret)
pr_err("Memory error not recovered");
else
- set_mce_nospec(m->addr >> PAGE_SHIFT);
+ set_mce_nospec(m->addr >> PAGE_SHIFT, whole_page(m));
return ret;
}
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index a1806598aaa4..cf2f2a85f087 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -954,18 +954,31 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
return true;
}
-/*
- * This is similar to user_regset_copyout(), but will not add offset to
- * the source data pointer or increment pos, count, kbuf, and ubuf.
- */
-static inline void
-__copy_xstate_to_kernel(void *kbuf, const void *data,
- unsigned int offset, unsigned int size, unsigned int size_total)
+static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count)
{
- if (offset < size_total) {
- unsigned int copy = min(size, size_total - offset);
+ if (*pos < to) {
+ unsigned size = to - *pos;
+
+ if (size > *count)
+ size = *count;
+ memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size);
+ *kbuf += size;
+ *pos += size;
+ *count -= size;
+ }
+}
- memcpy(kbuf + offset, data, copy);
+static void copy_part(unsigned offset, unsigned size, void *from,
+ void **kbuf, unsigned *pos, unsigned *count)
+{
+ fill_gap(offset, kbuf, pos, count);
+ if (size > *count)
+ size = *count;
+ if (size) {
+ memcpy(*kbuf, from, size);
+ *kbuf += size;
+ *pos += size;
+ *count -= size;
}
}
@@ -978,8 +991,9 @@ __copy_xstate_to_kernel(void *kbuf, const void *data,
*/
int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
{
- unsigned int offset, size;
struct xstate_header header;
+ const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr);
+ unsigned count = size_total;
int i;
/*
@@ -995,46 +1009,42 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
header.xfeatures = xsave->header.xfeatures;
header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+ if (header.xfeatures & XFEATURE_MASK_FP)
+ copy_part(0, off_mxcsr,
+ &xsave->i387, &kbuf, &offset_start, &count);
+ if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM))
+ copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE,
+ &xsave->i387.mxcsr, &kbuf, &offset_start, &count);
+ if (header.xfeatures & XFEATURE_MASK_FP)
+ copy_part(offsetof(struct fxregs_state, st_space), 128,
+ &xsave->i387.st_space, &kbuf, &offset_start, &count);
+ if (header.xfeatures & XFEATURE_MASK_SSE)
+ copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256,
+ &xsave->i387.xmm_space, &kbuf, &offset_start, &count);
+ /*
+ * Fill xsave->i387.sw_reserved value for ptrace frame:
+ */
+ copy_part(offsetof(struct fxregs_state, sw_reserved), 48,
+ xstate_fx_sw_bytes, &kbuf, &offset_start, &count);
/*
* Copy xregs_state->header:
*/
- offset = offsetof(struct xregs_state, header);
- size = sizeof(header);
-
- __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
+ copy_part(offsetof(struct xregs_state, header), sizeof(header),
+ &header, &kbuf, &offset_start, &count);
- for (i = 0; i < XFEATURE_MAX; i++) {
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
/*
* Copy only in-use xstates:
*/
if ((header.xfeatures >> i) & 1) {
void *src = __raw_xsave_addr(xsave, i);
- offset = xstate_offsets[i];
- size = xstate_sizes[i];
-
- /* The next component has to fit fully into the output buffer: */
- if (offset + size > size_total)
- break;
-
- __copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
+ copy_part(xstate_offsets[i], xstate_sizes[i],
+ src, &kbuf, &offset_start, &count);
}
}
-
- if (xfeatures_mxcsr_quirk(header.xfeatures)) {
- offset = offsetof(struct fxregs_state, mxcsr);
- size = MXCSR_AND_FLAGS_SIZE;
- __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total);
- }
-
- /*
- * Fill xsave->i387.sw_reserved value for ptrace frame:
- */
- offset = offsetof(struct fxregs_state, sw_reserved);
- size = sizeof(xstate_fx_sw_bytes);
-
- __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
+ fill_gap(size_total, &kbuf, &offset_start, &count);
return 0;
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 37a0aeaf89e7..b0e641793be4 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -407,7 +407,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
set_vm_flush_reset_perms(trampoline);
- set_memory_ro((unsigned long)trampoline, npages);
+ if (likely(system_state != SYSTEM_BOOTING))
+ set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
@@ -415,6 +416,32 @@ fail:
return 0;
}
+void set_ftrace_ops_ro(void)
+{
+ struct ftrace_ops *ops;
+ unsigned long start_offset;
+ unsigned long end_offset;
+ unsigned long npages;
+ unsigned long size;
+
+ do_for_each_ftrace_op(ops, ftrace_ops_list) {
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ continue;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ end_offset = (unsigned long)ftrace_regs_caller_end;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ end_offset = (unsigned long)ftrace_epilogue;
+ }
+ size = end_offset - start_offset;
+ size = size + RET_SIZE + sizeof(void *);
+ npages = DIV_ROUND_UP(size, PAGE_SIZE);
+ set_memory_ro((unsigned long)ops->trampoline, npages);
+ } while_for_each_ftrace_op(ops);
+}
+
static unsigned long calc_trampoline_call_offset(bool save_regs)
{
unsigned long start_offset;
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8abeee0dd7bf..fce678f4471e 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -32,15 +32,15 @@ void io_bitmap_share(struct task_struct *tsk)
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
}
-static void task_update_io_bitmap(void)
+static void task_update_io_bitmap(struct task_struct *tsk)
{
- struct thread_struct *t = &current->thread;
+ struct thread_struct *t = &tsk->thread;
if (t->iopl_emul == 3 || t->io_bitmap) {
/* TSS update is handled on exit to user space */
- set_thread_flag(TIF_IO_BITMAP);
+ set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
} else {
- clear_thread_flag(TIF_IO_BITMAP);
+ clear_tsk_thread_flag(tsk, TIF_IO_BITMAP);
/* Invalidate TSS */
preempt_disable();
tss_update_io_bitmap();
@@ -48,12 +48,12 @@ static void task_update_io_bitmap(void)
}
}
-void io_bitmap_exit(void)
+void io_bitmap_exit(struct task_struct *tsk)
{
- struct io_bitmap *iobm = current->thread.io_bitmap;
+ struct io_bitmap *iobm = tsk->thread.io_bitmap;
- current->thread.io_bitmap = NULL;
- task_update_io_bitmap();
+ tsk->thread.io_bitmap = NULL;
+ task_update_io_bitmap(tsk);
if (iobm && refcount_dec_and_test(&iobm->refcnt))
kfree(iobm);
}
@@ -101,7 +101,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
if (!iobm)
return -ENOMEM;
refcount_set(&iobm->refcnt, 1);
- io_bitmap_exit();
+ io_bitmap_exit(current);
}
/*
@@ -133,7 +133,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
}
/* All permissions dropped? */
if (max_long == UINT_MAX) {
- io_bitmap_exit();
+ io_bitmap_exit(current);
return 0;
}
@@ -191,7 +191,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
}
t->iopl_emul = level;
- task_update_io_bitmap();
+ task_update_io_bitmap(current);
return 0;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3053c85e0e42..3d88300ec306 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -97,7 +97,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
}
/*
- * Free current thread data structures etc..
+ * Free thread data structures etc..
*/
void exit_thread(struct task_struct *tsk)
{
@@ -105,7 +105,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu;
if (test_thread_flag(TIF_IO_BITMAP))
- io_bitmap_exit();
+ io_bitmap_exit(tsk);
free_vm86(t);
@@ -546,28 +546,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
lockdep_assert_irqs_disabled();
- /*
- * If TIF_SSBD is different, select the proper mitigation
- * method. Note that if SSBD mitigation is disabled or permanentely
- * enabled this branch can't be taken because nothing can set
- * TIF_SSBD.
- */
- if (tif_diff & _TIF_SSBD) {
- if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ /* Handle change of TIF_SSBD depending on the mitigation method. */
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
amd_set_ssb_virt_state(tifn);
- } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
amd_set_core_ssb_state(tifn);
- } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
- static_cpu_has(X86_FEATURE_AMD_SSBD)) {
- msr |= ssbd_tif_to_spec_ctrl(tifn);
- updmsr = true;
- }
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ updmsr |= !!(tif_diff & _TIF_SSBD);
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
}
- /*
- * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
- * otherwise avoid the MSR write.
- */
+ /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
if (IS_ENABLED(CONFIG_SMP) &&
static_branch_unlikely(&switch_to_cond_stibp)) {
updmsr |= !!(tif_diff & _TIF_SPEC_IB);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 0cc7c0b106bb..762f5c1465a6 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -197,6 +197,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
},
},
+ { /* Handle problems with rebooting on Apple MacBook6,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook6,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"),
+ },
+ },
{ /* Handle problems with rebooting on Apple MacBookPro5 */
.callback = set_pci_reboot,
.ident = "Apple MacBookPro5",
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 69881b2d446c..9674321ce3a3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -262,6 +262,14 @@ static void notrace start_secondary(void *unused)
wmb();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+
+ /*
+ * Prevent tail call to cpu_startup_entry() because the stack protector
+ * guard has been changed a couple of function calls up, in
+ * boot_init_stack_canary() and must not be checked before tail calling
+ * another function.
+ */
+ prevent_tail_call_optimization();
}
/**
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index d8673d8a779b..36a585b80d9e 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -25,10 +25,6 @@
#include <asm/hpet.h>
#include <asm/time.h>
-#ifdef CONFIG_X86_64
-__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES;
-#endif
-
unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index e9cc182aa97e..1a90abeca5f3 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -142,9 +142,6 @@ static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
- if (!orc_init)
- return NULL;
-
if (ip == 0)
return &null_orc_entry;
@@ -317,12 +314,19 @@ EXPORT_SYMBOL_GPL(unwind_get_return_address);
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
{
+ struct task_struct *task = state->task;
+
if (unwind_done(state))
return NULL;
if (state->regs)
return &state->regs->ip;
+ if (task != current && state->sp == task->thread.sp) {
+ struct inactive_task_frame *frame = (void *)task->thread.sp;
+ return &frame->ret_addr;
+ }
+
if (state->sp)
return (unsigned long *)state->sp - 1;
@@ -381,9 +385,38 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr
return true;
}
+/*
+ * If state->regs is non-NULL, and points to a full pt_regs, just get the reg
+ * value from state->regs.
+ *
+ * Otherwise, if state->regs just points to IRET regs, and the previous frame
+ * had full regs, it's safe to get the value from the previous regs. This can
+ * happen when early/late IRQ entry code gets interrupted by an NMI.
+ */
+static bool get_reg(struct unwind_state *state, unsigned int reg_off,
+ unsigned long *val)
+{
+ unsigned int reg = reg_off/8;
+
+ if (!state->regs)
+ return false;
+
+ if (state->full_regs) {
+ *val = ((unsigned long *)state->regs)[reg];
+ return true;
+ }
+
+ if (state->prev_regs) {
+ *val = ((unsigned long *)state->prev_regs)[reg];
+ return true;
+ }
+
+ return false;
+}
+
bool unwind_next_frame(struct unwind_state *state)
{
- unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp;
+ unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
bool indirect = false;
@@ -445,39 +478,35 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_REG_R10:
- if (!state->regs || !state->full_regs) {
+ if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) {
orc_warn("missing regs for base reg R10 at ip %pB\n",
(void *)state->ip);
goto err;
}
- sp = state->regs->r10;
break;
case ORC_REG_R13:
- if (!state->regs || !state->full_regs) {
+ if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) {
orc_warn("missing regs for base reg R13 at ip %pB\n",
(void *)state->ip);
goto err;
}
- sp = state->regs->r13;
break;
case ORC_REG_DI:
- if (!state->regs || !state->full_regs) {
+ if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) {
orc_warn("missing regs for base reg DI at ip %pB\n",
(void *)state->ip);
goto err;
}
- sp = state->regs->di;
break;
case ORC_REG_DX:
- if (!state->regs || !state->full_regs) {
+ if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) {
orc_warn("missing regs for base reg DX at ip %pB\n",
(void *)state->ip);
goto err;
}
- sp = state->regs->dx;
break;
default:
@@ -504,6 +533,7 @@ bool unwind_next_frame(struct unwind_state *state)
state->sp = sp;
state->regs = NULL;
+ state->prev_regs = NULL;
state->signal = false;
break;
@@ -515,6 +545,7 @@ bool unwind_next_frame(struct unwind_state *state)
}
state->regs = (struct pt_regs *)sp;
+ state->prev_regs = NULL;
state->full_regs = true;
state->signal = true;
break;
@@ -526,6 +557,8 @@ bool unwind_next_frame(struct unwind_state *state)
goto err;
}
+ if (state->full_regs)
+ state->prev_regs = state->regs;
state->regs = (void *)sp - IRET_FRAME_OFFSET;
state->full_regs = false;
state->signal = true;
@@ -534,14 +567,14 @@ bool unwind_next_frame(struct unwind_state *state)
default:
orc_warn("unknown .orc_unwind entry type %d for ip %pB\n",
orc->type, (void *)orig_ip);
- break;
+ goto err;
}
/* Find BP: */
switch (orc->bp_reg) {
case ORC_REG_UNDEFINED:
- if (state->regs && state->full_regs)
- state->bp = state->regs->bp;
+ if (get_reg(state, offsetof(struct pt_regs, bp), &tmp))
+ state->bp = tmp;
break;
case ORC_REG_PREV_SP:
@@ -588,17 +621,20 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
memset(state, 0, sizeof(*state));
state->task = task;
+ if (!orc_init)
+ goto err;
+
/*
* Refuse to unwind the stack of a task while it's executing on another
* CPU. This check is racy, but that's ok: the unwinder has other
* checks to prevent it from going off the rails.
*/
if (task_on_another_cpu(task))
- goto done;
+ goto err;
if (regs) {
if (user_mode(regs))
- goto done;
+ goto the_end;
state->ip = regs->ip;
state->sp = regs->sp;
@@ -631,6 +667,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
* generate some kind of backtrace if this happens.
*/
void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ state->error = true;
if (get_stack_info(next_page, state->task, &state->stack_info,
&state->stack_mask))
return;
@@ -651,13 +688,14 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
/* Otherwise, skip ahead to the user-specified starting frame: */
while (!unwind_done(state) &&
(!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
- state->sp <= (unsigned long)first_frame))
+ state->sp < (unsigned long)first_frame))
unwind_next_frame(state);
return;
-done:
+err:
+ state->error = true;
+the_end:
state->stack_info.type = STACK_TYPE_UNKNOWN;
- return;
}
EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e3296aa028fe..ccb2dec210ef 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -39,13 +39,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
#ifdef CONFIG_X86_32
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
-jiffies = jiffies_64;
#else
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
#endif
+jiffies = jiffies_64;
+
#if defined(CONFIG_X86_64)
/*
* On 64-bit, align RODATA to 2MB so we retain large page mappings for
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 750ff0b29404..d057376bd3d3 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -225,12 +225,12 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
}
/*
- * AMD SVM AVIC accelerate EOI write and do not trap,
- * in-kernel IOAPIC will not be able to receive the EOI.
- * In this case, we do lazy update of the pending EOI when
- * trying to set IOAPIC irq.
+ * AMD SVM AVIC accelerate EOI write iff the interrupt is edge
+ * triggered, in which case the in-kernel IOAPIC will not be able
+ * to receive the EOI. In this case, we do a lazy update of the
+ * pending EOI when trying to set IOAPIC irq.
*/
- if (kvm_apicv_activated(ioapic->kvm))
+ if (edge && kvm_apicv_activated(ioapic->kvm))
ioapic_lazy_update_eoi(ioapic, irq);
/*
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 87e9ba27ada1..ea6fa05e2fd9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -343,6 +343,8 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
BUG_ON((mmio_mask & mmio_value) != mmio_value);
+ WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+ WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
shadow_mmio_access_mask = access_mask;
@@ -591,16 +593,15 @@ static void kvm_mmu_reset_all_pte_masks(void)
* the most significant bits of legal physical address space.
*/
shadow_nonpresent_or_rsvd_mask = 0;
- low_phys_bits = boot_cpu_data.x86_cache_bits;
- if (boot_cpu_data.x86_cache_bits <
- 52 - shadow_nonpresent_or_rsvd_mask_len) {
+ low_phys_bits = boot_cpu_data.x86_phys_bits;
+ if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+ !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+ 52 - shadow_nonpresent_or_rsvd_mask_len)) {
+ low_phys_bits = boot_cpu_data.x86_cache_bits
+ - shadow_nonpresent_or_rsvd_mask_len;
shadow_nonpresent_or_rsvd_mask =
- rsvd_bits(boot_cpu_data.x86_cache_bits -
- shadow_nonpresent_or_rsvd_mask_len,
- boot_cpu_data.x86_cache_bits - 1);
- low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
- } else
- WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
+ rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
+ }
shadow_nonpresent_or_rsvd_lower_gfn_mask =
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
@@ -6131,25 +6132,16 @@ static void kvm_set_mmio_spte_mask(void)
u64 mask;
/*
- * Set the reserved bits and the present bit of an paging-structure
- * entry to generate page fault with PFER.RSV = 1.
- */
-
- /*
- * Mask the uppermost physical address bit, which would be reserved as
- * long as the supported physical address width is less than 52.
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
*/
- mask = 1ull << 51;
-
- /* Set the present bit. */
- mask |= 1ull;
-
- /*
- * If reserved bit is not supported, clear the present bit to disable
- * mmio page fault.
- */
- if (shadow_phys_bits == 52)
- mask &= ~1ull;
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 451377533bcb..eee7cb0e1d95 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1886,7 +1886,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return NULL;
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
+ npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
@@ -3236,8 +3236,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs, but not async PF */
- if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
+ /* Trap async PF even if not shadowing */
+ if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason)
return NESTED_EXIT_HOST;
break;
default:
@@ -3326,7 +3326,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
dst->iopm_base_pa = from->iopm_base_pa;
dst->msrpm_base_pa = from->msrpm_base_pa;
dst->tsc_offset = from->tsc_offset;
- dst->asid = from->asid;
+ /* asid not copied, it is handled manually for svm->vmcb. */
dst->tlb_ctl = from->tlb_ctl;
dst->int_ctl = from->int_ctl;
dst->int_vector = from->int_vector;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index eec7b2d93104..a03db4a75977 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -303,7 +303,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
cpu = get_cpu();
prev = vmx->loaded_vmcs;
vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load_vmcs(vcpu, cpu);
+ vmx_vcpu_load_vmcs(vcpu, cpu, prev);
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
@@ -5504,6 +5504,23 @@ static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
return 1 & (b >> (field & 7));
}
+static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
+{
+ u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
+
+ if (nested_cpu_has_mtf(vmcs12))
+ return true;
+
+ /*
+ * An MTF VM-exit may be injected into the guest by setting the
+ * interruption-type to 7 (other event) and the vector field to 0. Such
+ * is the case regardless of the 'monitor trap flag' VM-execution
+ * control.
+ */
+ return entry_intr_info == (INTR_INFO_VALID_MASK
+ | INTR_TYPE_OTHER_EVENT);
+}
+
/*
* Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
* should handle it ourselves in L0 (and then continue L2). Only call this
@@ -5545,7 +5562,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
- switch (exit_reason) {
+ switch ((u16)exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
if (is_nmi(intr_info))
return false;
@@ -5618,7 +5635,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_MWAIT_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
case EXIT_REASON_MONITOR_TRAP_FLAG:
- return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+ return nested_vmx_exit_handled_mtf(vmcs12);
case EXIT_REASON_MONITOR_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
case EXIT_REASON_PAUSE_INSTRUCTION:
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index 09b0937d56b1..19717d0a1100 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -12,6 +12,7 @@
#define __ex(x) __kvm_handle_fault_on_reboot(x)
+asmlinkage void vmread_error(unsigned long field, bool fault);
__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
bool fault);
void vmwrite_error(unsigned long field, unsigned long value);
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 861ae40e7144..99410f372c41 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -86,6 +86,9 @@ SYM_FUNC_START(vmx_vmexit)
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+ /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */
+ or $1, %_ASM_AX
+
pop %_ASM_AX
.Lvmexit_skip_rsb:
#endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c1ffe7d24f83..b29902c521f2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1314,10 +1314,12 @@ after_clear_sn:
pi_set_on(pi_desc);
}
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+ struct loaded_vmcs *buddy)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+ struct vmcs *prev;
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
@@ -1336,10 +1338,18 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
local_irq_enable();
}
- if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ prev = per_cpu(current_vmcs, cpu);
+ if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
- indirect_branch_prediction_barrier();
+
+ /*
+ * No indirect branch prediction barrier needed when switching
+ * the active VMCS within a guest, e.g. on nested VM-Enter.
+ * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+ */
+ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
+ indirect_branch_prediction_barrier();
}
if (!already_loaded) {
@@ -1376,11 +1386,10 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx_vcpu_load_vmcs(vcpu, cpu);
+ vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
- vmx->host_pkru = read_pkru();
vmx->host_debugctlmsr = get_debugctlmsr();
}
@@ -6538,11 +6547,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
kvm_load_guest_xsave_state(vcpu);
- if (static_cpu_has(X86_FEATURE_PKU) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
- vcpu->arch.pkru != vmx->host_pkru)
- __write_pkru(vcpu->arch.pkru);
-
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
@@ -6631,18 +6635,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
pt_guest_exit(vmx);
- /*
- * eager fpu is enabled if PKEY is supported and CR4 is switched
- * back on host, so it is safe to read guest PKRU from current
- * XSAVE.
- */
- if (static_cpu_has(X86_FEATURE_PKU) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
- vcpu->arch.pkru = rdpkru();
- if (vcpu->arch.pkru != vmx->host_pkru)
- __write_pkru(vmx->host_pkru);
- }
-
kvm_load_host_xsave_state(vcpu);
vmx->nested.nested_run_pending = 0;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e64da06c7009..ff7361aa824c 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -320,7 +320,8 @@ struct kvm_vmx {
};
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+ struct loaded_vmcs *buddy);
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
int allocate_vpid(void);
void free_vpid(int vpid);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 17650bda4331..4b4a8a4e0251 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -809,11 +809,25 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
+
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
+ vcpu->arch.pkru != vcpu->arch.host_pkru)
+ __write_pkru(vcpu->arch.pkru);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+ vcpu->arch.pkru = rdpkru();
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
+ __write_pkru(vcpu->arch.host_pkru);
+ }
+
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
@@ -3529,6 +3543,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->vcpu_load(vcpu, cpu);
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
+
/* Apply any externally detected TSC adjustments (due to suspend) */
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
@@ -3722,7 +3739,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
unsigned bank_num = mcg_cap & 0xff, bank;
r = -EINVAL;
- if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
+ if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
goto out;
if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
@@ -4551,7 +4568,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (kvm_state.flags &
~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
- | KVM_STATE_NESTED_EVMCS))
+ | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING))
break;
/* nested_run_pending implies guest_mode. */
@@ -6891,7 +6908,7 @@ restart:
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_rip_write(vcpu, ctxt->eip);
- if (r && ctxt->tf)
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
if (kvm_x86_ops->update_emulated_instruction)
kvm_x86_ops->update_emulated_instruction(vcpu);
@@ -8098,9 +8115,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
-int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end,
- bool blockable)
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
{
unsigned long apic_address;
@@ -8111,8 +8127,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (start <= apic_address && apic_address < end)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-
- return 0;
}
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 69309cd56fdf..33093fdedb02 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -249,10 +249,22 @@ static void note_wx(struct pg_state *st, unsigned long addr)
(void *)st->start_address);
}
-static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
+static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
{
- return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
- ((prot1 | prot2) & _PAGE_NX);
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+ pgprotval_t prot = val & PTE_FLAGS_MASK;
+ pgprotval_t effective;
+
+ if (level > 0) {
+ pgprotval_t higher_prot = st->prot_levels[level - 1];
+
+ effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) |
+ ((higher_prot | prot) & _PAGE_NX);
+ } else {
+ effective = prot;
+ }
+
+ st->prot_levels[level] = effective;
}
/*
@@ -270,16 +282,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
struct seq_file *m = st->seq;
new_prot = val & PTE_FLAGS_MASK;
-
- if (level > 0) {
- new_eff = effective_prot(st->prot_levels[level - 1],
- new_prot);
- } else {
- new_eff = new_prot;
- }
-
- if (level >= 0)
- st->prot_levels[level] = new_eff;
+ if (!val)
+ new_eff = 0;
+ else
+ new_eff = st->prot_levels[level];
/*
* If we have a "break" in the series, we need to flush the state that
@@ -374,6 +380,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m,
struct pg_state st = {
.ptdump = {
.note_page = note_page,
+ .effective_prot = effective_prot,
.range = ptdump_ranges
},
.level = -1,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index abbdecb75fad..023e1ec5e153 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,7 @@
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>
+#include <asm/ftrace.h>
#include "mm_internal.h"
@@ -1288,6 +1289,8 @@ void mark_rodata_ro(void)
all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
+ set_ftrace_ops_ro();
+
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
set_memory_rw(start, (end-start) >> PAGE_SHIFT);
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 673de6063345..92530af38b09 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -372,7 +372,7 @@ static void enter_uniprocessor(void)
int cpu;
int err;
- if (downed_cpus == NULL &&
+ if (!cpumask_available(downed_cpus) &&
!alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
pr_notice("Failed to allocate mask\n");
goto out;
@@ -402,7 +402,7 @@ static void leave_uniprocessor(void)
int cpu;
int err;
- if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
+ if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0)
return;
pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index c4aedd00c1ba..7ab317e3184e 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -42,7 +42,8 @@ struct cpa_data {
unsigned long pfn;
unsigned int flags;
unsigned int force_split : 1,
- force_static_prot : 1;
+ force_static_prot : 1,
+ force_flush_all : 1;
struct page **pages;
};
@@ -352,10 +353,10 @@ static void cpa_flush(struct cpa_data *data, int cache)
return;
}
- if (cpa->numpages <= tlb_single_page_flush_ceiling)
- on_each_cpu(__cpa_flush_tlb, cpa, 1);
- else
+ if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
flush_tlb_all();
+ else
+ on_each_cpu(__cpa_flush_tlb, cpa, 1);
if (!cache)
return;
@@ -1595,6 +1596,8 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ cpa->force_flush_all = 1;
+
ret = __change_page_attr_set_clr(&alias_cpa, 0);
if (ret)
return ret;
@@ -1615,6 +1618,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ cpa->force_flush_all = 1;
/*
* The high mapping range is imprecise, so ignore the
* return value.
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index e723559c386a..0c67a5a94de3 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -572,6 +572,10 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ec, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ed, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26c, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar);
/*
* Device [1022:7808]
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 802ee5bba66c..0cebe5db691d 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -92,6 +92,7 @@ asmlinkage __visible void cpu_bringup_and_idle(void)
cpu_bringup();
boot_init_stack_canary();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+ prevent_tail_call_optimization();
}
void xen_smp_intr_free_pv(unsigned int cpu)