Diffstat (limited to 'common/recipes-kernel')
34 files changed, 4041 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0001-KVM-Fix-stack-out-of-bounds-read-in-write_mmio.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0001-KVM-Fix-stack-out-of-bounds-read-in-write_mmio.patch new file mode 100644 index 00000000..9772c5f8 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0001-KVM-Fix-stack-out-of-bounds-read-in-write_mmio.patch @@ -0,0 +1,165 @@ +From af0e9ccc133f03f5150a7afba349a9f50897f793 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li <wanpeng.li@hotmail.com> +Date: Thu, 14 Dec 2017 17:40:50 -0800 +Subject: [PATCH 01/33] KVM: Fix stack-out-of-bounds read in write_mmio +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit e39d200fa5bf5b94a0948db0dae44c1b73b84a56 upstream. + +Reported by syzkaller: + + BUG: KASAN: stack-out-of-bounds in write_mmio+0x11e/0x270 [kvm] + Read of size 8 at addr ffff8803259df7f8 by task syz-executor/32298 + + CPU: 6 PID: 32298 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #18 + Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 + Call Trace: + dump_stack+0xab/0xe1 + print_address_description+0x6b/0x290 + kasan_report+0x28a/0x370 + write_mmio+0x11e/0x270 [kvm] + emulator_read_write_onepage+0x311/0x600 [kvm] + emulator_read_write+0xef/0x240 [kvm] + emulator_fix_hypercall+0x105/0x150 [kvm] + em_hypercall+0x2b/0x80 [kvm] + x86_emulate_insn+0x2b1/0x1640 [kvm] + x86_emulate_instruction+0x39a/0xb90 [kvm] + handle_exception+0x1b4/0x4d0 [kvm_intel] + vcpu_enter_guest+0x15a0/0x2640 [kvm] + kvm_arch_vcpu_ioctl_run+0x549/0x7d0 [kvm] + kvm_vcpu_ioctl+0x479/0x880 [kvm] + do_vfs_ioctl+0x142/0x9a0 + SyS_ioctl+0x74/0x80 + entry_SYSCALL_64_fastpath+0x23/0x9a + +The path of patched vmmcall will patch 3 bytes opcode 0F 01 C1(vmcall) +to the guest memory, however, write_mmio tracepoint always prints 8 bytes +through *(u64 *)val since kvm splits the mmio access into 8 bytes. This +leaks 5 bytes from the kernel stack (CVE-2017-17741). This patch fixes +it by just accessing the bytes which we operate on. + +Before patch: + +syz-executor-5567 [007] .... 51370.561696: kvm_mmio: mmio write len 3 gpa 0x10 val 0x1ffff10077c1010f + +After patch: + +syz-executor-13416 [002] .... 
51302.299573: kvm_mmio: mmio write len 3 gpa 0x10 val 0xc1010f + +Reported-by: Dmitry Vyukov <dvyukov@google.com> +Reviewed-by: Darren Kenny <darren.kenny@oracle.com> +Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> +Tested-by: Marc Zyngier <marc.zyngier@arm.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: Marc Zyngier <marc.zyngier@arm.com> +Cc: Christoffer Dall <christoffer.dall@linaro.org> +Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/arm/kvm/mmio.c | 6 +++--- + arch/x86/kvm/x86.c | 8 ++++---- + include/trace/events/kvm.h | 7 +++++-- + 3 files changed, 12 insertions(+), 9 deletions(-) + +diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c +index b6e715f..dac7ceb 100644 +--- a/arch/arm/kvm/mmio.c ++++ b/arch/arm/kvm/mmio.c +@@ -112,7 +112,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) + } + + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, +- data); ++ &data); + data = vcpu_data_host_to_guest(vcpu, data, len); + vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); + } +@@ -182,14 +182,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, + data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), + len); + +- trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data); ++ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); + kvm_mmio_write_buf(data_buf, len, data); + + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); + } else { + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, +- fault_ipa, 0); ++ fault_ipa, NULL); + + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 51a700a..9cc9117 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -4242,7 +4242,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) + addr, n, v)) + && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) + break; +- trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); ++ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v); + handled += n; + addr += n; + len -= n; +@@ -4495,7 +4495,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) + { + if (vcpu->mmio_read_completed) { + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, +- vcpu->mmio_fragments[0].gpa, *(u64 *)val); ++ vcpu->mmio_fragments[0].gpa, val); + vcpu->mmio_read_completed = 0; + return 1; + } +@@ -4517,14 +4517,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + + static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) + { +- trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); ++ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val); + return vcpu_mmio_write(vcpu, gpa, bytes, val); + } + + static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) + { +- trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); ++ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL); + return X86EMUL_IO_NEEDED; + } + +diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h +index 8ade3eb..90fce4d 100644 +--- a/include/trace/events/kvm.h ++++ b/include/trace/events/kvm.h +@@ -208,7 +208,7 @@ TRACE_EVENT(kvm_ack_irq, + { KVM_TRACE_MMIO_WRITE, "write" } + + TRACE_EVENT(kvm_mmio, +- TP_PROTO(int type, int len, u64 gpa, 
u64 val), ++ TP_PROTO(int type, int len, u64 gpa, void *val), + TP_ARGS(type, len, gpa, val), + + TP_STRUCT__entry( +@@ -222,7 +222,10 @@ TRACE_EVENT(kvm_mmio, + __entry->type = type; + __entry->len = len; + __entry->gpa = gpa; +- __entry->val = val; ++ __entry->val = 0; ++ if (val) ++ memcpy(&__entry->val, val, ++ min_t(u32, sizeof(__entry->val), len)); + ), + + TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0002-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0002-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch new file mode 100644 index 00000000..406a79d3 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0002-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch @@ -0,0 +1,97 @@ +From 1cd771013c357075c745f99419bdaf31503c5a51 Mon Sep 17 00:00:00 2001 +From: Jim Mattson <jmattson@google.com> +Date: Wed, 3 Jan 2018 14:31:38 -0800 +Subject: [PATCH 02/33] kvm: vmx: Scrub hardware GPRs at VM-exit + +commit 0cb5b30698fdc8f6b4646012e3acb4ddce430788 upstream. + +Guest GPR values are live in the hardware GPRs at VM-exit. Do not +leave any guest values in hardware GPRs after the guest GPR values are +saved to the vcpu_vmx structure. + +This is a partial mitigation for CVE 2017-5715 and CVE 2017-5753. +Specifically, it defeats the Project Zero PoC for CVE 2017-5715. + +Suggested-by: Eric Northup <digitaleric@google.com> +Signed-off-by: Jim Mattson <jmattson@google.com> +Reviewed-by: Eric Northup <digitaleric@google.com> +Reviewed-by: Benjamin Serebrin <serebrin@google.com> +Reviewed-by: Andrew Honig <ahonig@google.com> +[Paolo: Add AMD bits, Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>] +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/svm.c | 19 +++++++++++++++++++ + arch/x86/kvm/vmx.c | 14 +++++++++++++- + 2 files changed, 32 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 975ea99..491f077 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -4858,6 +4858,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + "mov %%r14, %c[r14](%[svm]) \n\t" + "mov %%r15, %c[r15](%[svm]) \n\t" + #endif ++ /* ++ * Clear host registers marked as clobbered to prevent ++ * speculative use. 
++ */ ++ "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" ++ "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" ++ "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" ++ "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" ++ "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" ++#ifdef CONFIG_X86_64 ++ "xor %%r8, %%r8 \n\t" ++ "xor %%r9, %%r9 \n\t" ++ "xor %%r10, %%r10 \n\t" ++ "xor %%r11, %%r11 \n\t" ++ "xor %%r12, %%r12 \n\t" ++ "xor %%r13, %%r13 \n\t" ++ "xor %%r14, %%r14 \n\t" ++ "xor %%r15, %%r15 \n\t" ++#endif + "pop %%" _ASM_BP + : + : [svm]"a"(svm), +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 4ead27f..91ae4e2 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -8932,6 +8932,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + /* Save guest registers, load host registers, keep flags */ + "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" + "pop %0 \n\t" ++ "setbe %c[fail](%0)\n\t" + "mov %%" _ASM_AX ", %c[rax](%0) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" + __ASM_SIZE(pop) " %c[rcx](%0) \n\t" +@@ -8948,12 +8949,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" ++ "xor %%r8d, %%r8d \n\t" ++ "xor %%r9d, %%r9d \n\t" ++ "xor %%r10d, %%r10d \n\t" ++ "xor %%r11d, %%r11d \n\t" ++ "xor %%r12d, %%r12d \n\t" ++ "xor %%r13d, %%r13d \n\t" ++ "xor %%r14d, %%r14d \n\t" ++ "xor %%r15d, %%r15d \n\t" + #endif + "mov %%cr2, %%" _ASM_AX " \n\t" + "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" + ++ "xor %%eax, %%eax \n\t" ++ "xor %%ebx, %%ebx \n\t" ++ "xor %%esi, %%esi \n\t" ++ "xor %%edi, %%edi \n\t" + "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" +- "setbe %c[fail](%0) \n\t" + ".pushsection .rodata \n\t" + ".global vmx_return \n\t" + "vmx_return: " _ASM_PTR " 2b \n\t" +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0003-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0003-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch new file mode 100644 index 00000000..b53db2f4 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0003-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch @@ -0,0 +1,45 @@ +From ab442dfc820b6ebdbb1c135e6fad66130d44e5a8 Mon Sep 17 00:00:00 2001 +From: Andrew Honig <ahonig@google.com> +Date: Wed, 10 Jan 2018 10:12:03 -0800 +Subject: [PATCH 03/33] KVM: x86: Add memory barrier on vmcs field lookup + +commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream. + +This adds a memory barrier when performing a lookup into +the vmcs_field_to_offset_table. This is related to +CVE-2017-5753. + +Signed-off-by: Andrew Honig <ahonig@google.com> +Reviewed-by: Jim Mattson <jmattson@google.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 91ae4e2..ee766c2 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -858,8 +858,16 @@ static inline short vmcs_field_to_offset(unsigned long field) + { + BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + +- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || +- vmcs_field_to_offset_table[field] == 0) ++ if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) ++ return -ENOENT; ++ ++ /* ++ * FIXME: Mitigation for CVE-2017-5753. To be replaced with a ++ * generic mechanism. 
++ */ ++ asm("lfence"); ++ ++ if (vmcs_field_to_offset_table[field] == 0) + return -ENOENT; + + return vmcs_field_to_offset_table[field]; +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0004-KVM-x86-emulator-Return-to-user-mode-on-L1-CPL-0-emu.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0004-KVM-x86-emulator-Return-to-user-mode-on-L1-CPL-0-emu.patch new file mode 100644 index 00000000..dd1f4c29 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0004-KVM-x86-emulator-Return-to-user-mode-on-L1-CPL-0-emu.patch @@ -0,0 +1,48 @@ +From ce7bea11dfe01825a2ced79b5bcc04b7e781e63b Mon Sep 17 00:00:00 2001 +From: Liran Alon <liran.alon@oracle.com> +Date: Sun, 5 Nov 2017 16:56:33 +0200 +Subject: [PATCH 04/33] KVM: x86: emulator: Return to user-mode on L1 CPL=0 + emulation failure +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[ Upstream commit 1f4dcb3b213235e642088709a1c54964d23365e9 ] + +On this case, handle_emulation_failure() fills kvm_run with +internal-error information which it expects to be delivered +to user-mode for further processing. +However, the code reports a wrong return-value which makes KVM to never +return to user-mode on this scenario. + +Fixes: 6d77dbfc88e3 ("KVM: inject #UD if instruction emulation fails and exit to +userspace") + +Signed-off-by: Liran Alon <liran.alon@oracle.com> +Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/x86.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 9cc9117..abbb37a 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -5265,7 +5265,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; +- r = EMULATE_FAIL; ++ r = EMULATE_USER_EXIT; + } + kvm_queue_exception(vcpu, UD_VECTOR); + +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-KVM-x86-Don-t-re-execute-instruction-when-not-passin.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-KVM-x86-Don-t-re-execute-instruction-when-not-passin.patch new file mode 100644 index 00000000..49770e88 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-KVM-x86-Don-t-re-execute-instruction-when-not-passin.patch @@ -0,0 +1,63 @@ +From 585df9100649b5038250e1c33cf8af019a77844c Mon Sep 17 00:00:00 2001 +From: Liran Alon <liran.alon@oracle.com> +Date: Sun, 5 Nov 2017 16:56:34 +0200 +Subject: [PATCH 05/33] KVM: x86: Don't re-execute instruction when not passing + CR2 value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[ Upstream commit 9b8ae63798cb97e785a667ff27e43fa6220cb734 ] + +In case of instruction-decode failure or emulation failure, +x86_emulate_instruction() will call reexecute_instruction() which will +attempt to use the cr2 value passed to x86_emulate_instruction(). 
+However, when x86_emulate_instruction() is called from +emulate_instruction(), cr2 is not passed (passed as 0) and therefore +it doesn't make sense to execute reexecute_instruction() logic at all. + +Fixes: 51d8b66199e9 ("KVM: cleanup emulate_instruction") + +Signed-off-by: Liran Alon <liran.alon@oracle.com> +Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/kvm_host.h | 3 ++- + arch/x86/kvm/vmx.c | 2 +- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index bdde807..6f6ee68 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1113,7 +1113,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + static inline int emulate_instruction(struct kvm_vcpu *vcpu, + int emulation_type) + { +- return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); ++ return x86_emulate_instruction(vcpu, 0, ++ emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); + } + + void kvm_enable_efer_bits(u64); +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index ee766c2..8e5001d 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -6232,7 +6232,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) + if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + return 1; + +- err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); ++ err = emulate_instruction(vcpu, 0); + + if (err == EMULATE_USER_EXIT) { + ++vcpu->stat.mmio_exits; +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0006-KVM-X86-Fix-operand-address-size-during-instruction-.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0006-KVM-X86-Fix-operand-address-size-during-instruction-.patch new file mode 100644 index 00000000..9430b597 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0006-KVM-X86-Fix-operand-address-size-during-instruction-.patch @@ -0,0 +1,67 @@ +From 399e9dee4411858aa4eb8894f031ff68ab3b5e9f Mon Sep 17 00:00:00 2001 +From: Wanpeng Li <wanpeng.li@hotmail.com> +Date: Sun, 5 Nov 2017 16:54:47 -0800 +Subject: [PATCH 06/33] KVM: X86: Fix operand/address-size during instruction + decoding +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[ Upstream commit 3853be2603191829b442b64dac6ae8ba0c027bf9 ] + +Pedro reported: + During tests that we conducted on KVM, we noticed that executing a "PUSH %ES" + instruction under KVM produces different results on both memory and the SP + register depending on whether EPT support is enabled. With EPT the SP is + reduced by 4 bytes (and the written value is 0-padded) but without EPT support + it is only reduced by 2 bytes. The difference can be observed when the CS.DB + field is 1 (32-bit) but not when it's 0 (16-bit). + +The internal segment descriptor cache exist even in real/vm8096 mode. The CS.D +also should be respected instead of just default operand/address-size/66H +prefix/67H prefix during instruction decoding. This patch fixes it by also +adjusting operand/address-size according to CS.D. 
+ +Reported-by: Pedro Fonseca <pfonseca@cs.washington.edu> +Tested-by: Pedro Fonseca <pfonseca@cs.washington.edu> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: Nadav Amit <nadav.amit@gmail.com> +Cc: Pedro Fonseca <pfonseca@cs.washington.edu> +Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> +Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/emulate.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c +index 9f676ad..9984daf 100644 +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -4971,6 +4971,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) + bool op_prefix = false; + bool has_seg_override = false; + struct opcode opcode; ++ u16 dummy; ++ struct desc_struct desc; + + ctxt->memop.type = OP_NONE; + ctxt->memopp = NULL; +@@ -4989,6 +4991,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: ++ def_op_bytes = def_ad_bytes = 2; ++ ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS); ++ if (desc.d) ++ def_op_bytes = def_ad_bytes = 4; ++ break; + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0007-KVM-x86-ioapic-Fix-level-triggered-EOI-and-IOAPIC-re.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0007-KVM-x86-ioapic-Fix-level-triggered-EOI-and-IOAPIC-re.patch new file mode 100644 index 00000000..2ca432cf --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0007-KVM-x86-ioapic-Fix-level-triggered-EOI-and-IOAPIC-re.patch @@ -0,0 +1,72 @@ +From 34cbfb000e9bd72eb48fb3d1e61be034053f743f Mon Sep 17 00:00:00 2001 +From: Nikita Leshenko <nikita.leshchenko@oracle.com> +Date: Sun, 5 Nov 2017 15:52:29 +0200 +Subject: [PATCH 07/33] KVM: x86: ioapic: Fix level-triggered EOI and IOAPIC + reconfigure race +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[ Upstream commit 0fc5a36dd6b345eb0d251a65c236e53bead3eef7 ] + +KVM uses ioapic_handled_vectors to track vectors that need to notify the +IOAPIC on EOI. The problem is that IOAPIC can be reconfigured while an +interrupt with old configuration is pending or running and +ioapic_handled_vectors only remembers the newest configuration; +thus EOI from the old interrupt is not delievered to the IOAPIC. + +A previous commit db2bdcbbbd32 +("KVM: x86: fix edge EOI and IOAPIC reconfig race") +addressed this issue by adding pending edge-triggered interrupts to +ioapic_handled_vectors, fixing this race for edge-triggered interrupts. +The commit explicitly ignored level-triggered interrupts, +but this race applies to them as well: + +1) IOAPIC sends a level triggered interrupt vector to VCPU0 +2) VCPU0's handler deasserts the irq line and reconfigures the IOAPIC + to route the vector to VCPU1. The reconfiguration rewrites only the + upper 32 bits of the IOREDTBLn register. (Causes KVM to update + ioapic_handled_vectors for VCPU0 and it no longer includes the vector.) +3) VCPU0 sends EOI for the vector, but it's not delievered to the + IOAPIC because the ioapic_handled_vectors doesn't include the vector. 
+4) New interrupts are not delievered to VCPU1 because remote_irr bit + is set forever. + +Therefore, the correct behavior is to add all pending and running +interrupts to ioapic_handled_vectors. + +This commit introduces a slight performance hit similar to +commit db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") +for the rare case that the vector is reused by a non-IOAPIC source on +VCPU0. We prefer to keep solution simple and not handle this case just +as the original commit does. + +Fixes: db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") + +Signed-off-by: Nikita Leshenko <nikita.leshchenko@oracle.com> +Reviewed-by: Liran Alon <liran.alon@oracle.com> +Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/ioapic.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c +index 6e219e5..a7ac868 100644 +--- a/arch/x86/kvm/ioapic.c ++++ b/arch/x86/kvm/ioapic.c +@@ -257,8 +257,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) + index == RTC_GSI) { + if (kvm_apic_match_dest(vcpu, NULL, 0, + e->fields.dest_id, e->fields.dest_mode) || +- (e->fields.trig_mode == IOAPIC_EDGE_TRIG && +- kvm_apic_pending_eoi(vcpu, e->fields.vector))) ++ kvm_apic_pending_eoi(vcpu, e->fields.vector)) + __set_bit(e->fields.vector, + ioapic_handled_vectors); + } +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0008-KVM-x86-ioapic-Clear-Remote-IRR-when-entry-is-switch.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0008-KVM-x86-ioapic-Clear-Remote-IRR-when-entry-is-switch.patch new file mode 100644 index 00000000..6e097d05 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0008-KVM-x86-ioapic-Clear-Remote-IRR-when-entry-is-switch.patch @@ -0,0 +1,64 @@ +From aca211b549c07b81295e817e663a61a1ae1fd659 Mon Sep 17 00:00:00 2001 +From: Nikita Leshenko <nikita.leshchenko@oracle.com> +Date: Sun, 5 Nov 2017 15:52:32 +0200 +Subject: [PATCH 08/33] KVM: x86: ioapic: Clear Remote IRR when entry is + switched to edge-triggered +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[ Upstream commit a8bfec2930525808c01f038825d1df3904638631 ] + +Some OSes (Linux, Xen) use this behavior to clear the Remote IRR bit for +IOAPICs without an EOI register. They simulate the EOI message manually +by changing the trigger mode to edge and then back to level, with the +entry being masked during this. + +QEMU implements this feature in commit ed1263c363c9 +("ioapic: clear remote irr bit for edge-triggered interrupts") + +As a side effect, this commit removes an incorrect behavior where Remote +IRR was cleared when the redirection table entry was rewritten. This is not +consistent with the manual and also opens an opportunity for a strange +behavior when a redirection table entry is modified from an interrupt +handler that handles the same entry: The modification will clear the +Remote IRR bit even though the interrupt handler is still running. 
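In guest-visible terms, the idiom being emulated looks roughly like the following sketch, modeled on Linux's __eoi_ioapic_pin(); the helper names are abbreviated and illustrative, not taken from the patch:

    struct IO_APIC_route_entry entry, tmp;

    entry = tmp = ioapic_read_entry(apic, pin);  /* illustrative helper */
    tmp.mask = 1;               /* keep the pin masked during the switch */
    tmp.trigger = IOAPIC_EDGE;  /* edge mode makes hardware clear Remote IRR */
    ioapic_write_entry(apic, pin, tmp);
    ioapic_write_entry(apic, pin, entry);        /* restore level mode, unmask */

KVM now honors this by clearing remote_irr only when the entry is reconfigured as edge-triggered, as the hunk below shows.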
+ +Signed-off-by: Nikita Leshenko <nikita.leshchenko@oracle.com> +Reviewed-by: Liran Alon <liran.alon@oracle.com> +Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com> +Reviewed-by: Steve Rutherford <srutherford@google.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/ioapic.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c +index a7ac868..4b573c8 100644 +--- a/arch/x86/kvm/ioapic.c ++++ b/arch/x86/kvm/ioapic.c +@@ -306,8 +306,17 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) + } else { + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; +- e->fields.remote_irr = 0; + } ++ ++ /* ++ * Some OSes (Linux, Xen) assume that Remote IRR bit will ++ * be cleared by IOAPIC hardware when the entry is configured ++ * as edge-triggered. This behavior is used to simulate an ++ * explicit EOI on IOAPICs that don't have the EOI register. ++ */ ++ if (e->fields.trig_mode == IOAPIC_EDGE_TRIG) ++ e->fields.remote_irr = 0; ++ + mask_after = e->fields.mask; + if (mask_before != mask_after) + kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-KVM-x86-ioapic-Preserve-read-only-values-in-the-redi.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-KVM-x86-ioapic-Preserve-read-only-values-in-the-redi.patch new file mode 100644 index 00000000..071eccd3 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-KVM-x86-ioapic-Preserve-read-only-values-in-the-redi.patch @@ -0,0 +1,61 @@ +From a4337b660fe26046e81471186dc393ca77371b83 Mon Sep 17 00:00:00 2001 +From: Nikita Leshenko <nikita.leshchenko@oracle.com> +Date: Sun, 5 Nov 2017 15:52:33 +0200 +Subject: [PATCH 09/33] KVM: x86: ioapic: Preserve read-only values in the + redirection table +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[ Upstream commit b200dded0a6974a3b69599832b2203483920ab25 ] + +According to 82093AA (IOAPIC) manual, Remote IRR and Delivery Status are +read-only. QEMU implements the bits as RO in commit 479c2a1cb7fb +("ioapic: keep RO bits for IOAPIC entry"). 
+ +Signed-off-by: Nikita Leshenko <nikita.leshchenko@oracle.com> +Reviewed-by: Liran Alon <liran.alon@oracle.com> +Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com> +Reviewed-by: Steve Rutherford <srutherford@google.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/ioapic.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c +index 4b573c8..5f810bb 100644 +--- a/arch/x86/kvm/ioapic.c ++++ b/arch/x86/kvm/ioapic.c +@@ -278,6 +278,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) + { + unsigned index; + bool mask_before, mask_after; ++ int old_remote_irr, old_delivery_status; + union kvm_ioapic_redirect_entry *e; + + switch (ioapic->ioregsel) { +@@ -300,6 +301,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) + return; + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; ++ /* Preserve read-only fields */ ++ old_remote_irr = e->fields.remote_irr; ++ old_delivery_status = e->fields.delivery_status; + if (ioapic->ioregsel & 1) { + e->bits &= 0xffffffff; + e->bits |= (u64) val << 32; +@@ -307,6 +311,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; + } ++ e->fields.remote_irr = old_remote_irr; ++ e->fields.delivery_status = old_delivery_status; + + /* + * Some OSes (Linux, Xen) assume that Remote IRR bit will +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0010-KVM-VMX-Fix-rflags-cache-during-vCPU-reset.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0010-KVM-VMX-Fix-rflags-cache-during-vCPU-reset.patch new file mode 100644 index 00000000..7ab25b0b --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0010-KVM-VMX-Fix-rflags-cache-during-vCPU-reset.patch @@ -0,0 +1,103 @@ +From fc18f773d54edfedf8875473d8e69753265a3dfd Mon Sep 17 00:00:00 2001 +From: Wanpeng Li <wanpeng.li@hotmail.com> +Date: Mon, 20 Nov 2017 14:52:21 -0800 +Subject: [PATCH 10/33] KVM: VMX: Fix rflags cache during vCPU reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[ Upstream commit c37c28730bb031cc8a44a130c2555c0f3efbe2d0 ] + +Reported by syzkaller: + + *** Guest State *** + CR0: actual=0x0000000080010031, shadow=0x0000000060000010, gh_mask=fffffffffffffff7 + CR4: actual=0x0000000000002061, shadow=0x0000000000000000, gh_mask=ffffffffffffe8f1 + CR3 = 0x000000002081e000 + RSP = 0x000000000000fffa RIP = 0x0000000000000000 + RFLAGS=0x00023000 DR7 = 0x00000000000000 + ^^^^^^^^^^ + ------------[ cut here ]------------ + WARNING: CPU: 6 PID: 24431 at /home/kernel/linux/arch/x86/kvm//x86.c:7302 kvm_arch_vcpu_ioctl_run+0x651/0x2ea0 [kvm] + CPU: 6 PID: 24431 Comm: reprotest Tainted: G W OE 4.14.0+ #26 + RIP: 0010:kvm_arch_vcpu_ioctl_run+0x651/0x2ea0 [kvm] + RSP: 0018:ffff880291d179e0 EFLAGS: 00010202 + Call Trace: + kvm_vcpu_ioctl+0x479/0x880 [kvm] + do_vfs_ioctl+0x142/0x9a0 + SyS_ioctl+0x74/0x80 + entry_SYSCALL_64_fastpath+0x23/0x9a + +The failed vmentry is triggered by the following beautified testcase: + + #include <unistd.h> + #include <sys/syscall.h> + #include <string.h> + #include <stdint.h> + #include <linux/kvm.h> + #include <fcntl.h> + #include <sys/ioctl.h> + + long r[5]; + int main() + { + struct kvm_debugregs dr = { 0 }; + + r[2] 
= open("/dev/kvm", O_RDONLY); + r[3] = ioctl(r[2], KVM_CREATE_VM, 0); + r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7); + struct kvm_guest_debug debug = { + .control = 0xf0403, + .arch = { + .debugreg[6] = 0x2, + .debugreg[7] = 0x2 + } + }; + ioctl(r[4], KVM_SET_GUEST_DEBUG, &debug); + ioctl(r[4], KVM_RUN, 0); + } + +which testcase tries to setup the processor specific debug +registers and configure vCPU for handling guest debug events through +KVM_SET_GUEST_DEBUG. The KVM_SET_GUEST_DEBUG ioctl will get and set +rflags in order to set TF bit if single step is needed. All regs' caches +are reset to avail and GUEST_RFLAGS vmcs field is reset to 0x2 during vCPU +reset. However, the cache of rflags is not reset during vCPU reset. The +function vmx_get_rflags() returns an unreset rflags cache value since +the cache is marked avail, it is 0 after boot. Vmentry fails if the +rflags reserved bit 1 is 0. + +This patch fixes it by resetting both the GUEST_RFLAGS vmcs field and +its cache to 0x2 during vCPU reset. + +Reported-by: Dmitry Vyukov <dvyukov@google.com> +Tested-by: Dmitry Vyukov <dvyukov@google.com> +Reviewed-by: David Hildenbrand <david@redhat.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: Nadav Amit <nadav.amit@gmail.com> +Cc: Dmitry Vyukov <dvyukov@google.com> +Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 8e5001d..98f6545 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -5171,7 +5171,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + } + +- vmcs_writel(GUEST_RFLAGS, 0x02); ++ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + vmcs_writel(GUEST_GDTR_BASE, 0); +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0011-KVM-x86-Make-indirect-calls-in-emulator-speculation-.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0011-KVM-x86-Make-indirect-calls-in-emulator-speculation-.patch new file mode 100644 index 00000000..4e1d906b --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0011-KVM-x86-Make-indirect-calls-in-emulator-speculation-.patch @@ -0,0 +1,82 @@ +From adbb63b59bd2792df649335e7d3c28be2fbbe1c2 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Thu, 25 Jan 2018 10:58:13 +0100 +Subject: [PATCH 11/33] KVM: x86: Make indirect calls in emulator speculation + safe + +(cherry picked from commit 1a29b5b7f347a1a9230c1e0af5b37e3e571588ab) + +Replace the indirect calls with CALL_NOSPEC. 
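For background: with CONFIG_RETPOLINE, CALL_NOSPEC from <asm/nospec-branch.h> routes the call through a retpoline thunk that pins mispredicted speculation in a safe loop; without it, the macro degrades to a plain "call *%[thunk_target]". A minimal sketch of the pattern, assuming a hypothetical handler pointer:

    #include <asm/nospec-branch.h>

    static void (*handler)(void);

    static void dispatch(void)
    {
            /*
             * Hardened indirect call: the target goes in the thunk_target
             * operand instead of a bare "call *%0". Real callers must also
             * list the registers the callee may clobber.
             */
            asm volatile(CALL_NOSPEC
                         : /* no outputs */
                         : THUNK_TARGET(handler)
                         : "memory");
    }

The hunks below apply exactly this substitution to the emulator's fastop dispatch.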
+ +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Ashok Raj <ashok.raj@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: rga@amazon.de +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Jason Baron <jbaron@akamai.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Link: https://lkml.kernel.org/r/20180125095843.595615683@infradead.org +[dwmw2: Use ASM_CALL_CONSTRAINT like upstream, now we have it] +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/emulate.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c +index 9984daf..6faac71 100644 +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -25,6 +25,7 @@ + #include <asm/kvm_emulate.h> + #include <linux/stringify.h> + #include <asm/debugreg.h> ++#include <asm/nospec-branch.h> + + #include "x86.h" + #include "tss.h" +@@ -1012,8 +1013,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) + void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); + + flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; +- asm("push %[flags]; popf; call *%[fastop]" +- : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); ++ asm("push %[flags]; popf; " CALL_NOSPEC ++ : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags)); + return rc; + } + +@@ -5287,15 +5288,14 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, + + static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) + { +- register void *__sp asm(_ASM_SP); + ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + + if (!(ctxt->d & ByteOp)) + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + +- asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" ++ asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n" + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), +- [fastop]"+S"(fop), "+r"(__sp) ++ [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT + : "c"(ctxt->src2.val)); + + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0012-KVM-VMX-Make-indirect-call-speculation-safe.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0012-KVM-VMX-Make-indirect-call-speculation-safe.patch new file mode 100644 index 00000000..ba052d9e --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0012-KVM-VMX-Make-indirect-call-speculation-safe.patch @@ -0,0 +1,60 @@ +From 9eee1ba493f5899d7c3793818db16deaf084df21 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Thu, 25 Jan 2018 10:58:14 +0100 +Subject: [PATCH 12/33] KVM: VMX: Make indirect call speculation safe + +(cherry picked from commit c940a3fb1e2e9b7d03228ab28f375fb5a47ff699) + +Replace indirect call with CALL_NOSPEC. 
+ +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Ashok Raj <ashok.raj@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: rga@amazon.de +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Jason Baron <jbaron@akamai.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Link: https://lkml.kernel.org/r/20180125095843.645776917@infradead.org +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 98f6545..6f3ed0e 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -8659,14 +8659,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) + #endif + "pushf\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" +- "call *%[entry]\n\t" ++ CALL_NOSPEC + : + #ifdef CONFIG_X86_64 + [sp]"=&r"(tmp), + #endif + "+r"(__sp) + : +- [entry]"r"(entry), ++ THUNK_TARGET(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0013-x86-kvm-Update-spectre-v1-mitigation.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0013-x86-kvm-Update-spectre-v1-mitigation.patch new file mode 100644 index 00000000..8b58f32e --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0013-x86-kvm-Update-spectre-v1-mitigation.patch @@ -0,0 +1,72 @@ +From 7a1d0c7758b49b1f107157db33df0aae1c10cf26 Mon Sep 17 00:00:00 2001 +From: Dan Williams <dan.j.williams@intel.com> +Date: Wed, 31 Jan 2018 17:47:03 -0800 +Subject: [PATCH 13/33] x86/kvm: Update spectre-v1 mitigation + +(cherry picked from commit 085331dfc6bbe3501fb936e657331ca943827600) + +Commit 75f139aaf896 "KVM: x86: Add memory barrier on vmcs field lookup" +added a raw 'asm("lfence");' to prevent a bounds check bypass of +'vmcs_field_to_offset_table'. + +The lfence can be avoided in this path by using the array_index_nospec() +helper designed for these types of fixes. 
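The helper's general shape, for reference (a minimal sketch; the table is hypothetical):

    #include <linux/kernel.h>
    #include <linux/nospec.h>

    static const unsigned short offset_table[64];

    static int lookup(unsigned long field)
    {
            if (field >= ARRAY_SIZE(offset_table))
                    return -ENOENT;
            /*
             * Clamp the index without a serializing lfence: even if the
             * bounds check is speculatively bypassed, the masked index
             * cannot reach out-of-bounds memory (Spectre v1).
             */
            field = array_index_nospec(field, ARRAY_SIZE(offset_table));
            return offset_table[field];
    }

The hunk below applies the same pattern to vmcs_field_to_offset().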
+ +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Paolo Bonzini <pbonzini@redhat.com> +Cc: Andrew Honig <ahonig@google.com> +Cc: kvm@vger.kernel.org +Cc: Jim Mattson <jmattson@google.com> +Link: https://lkml.kernel.org/r/151744959670.6342.3001723920950249067.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 20 +++++++++----------- + 1 file changed, 9 insertions(+), 11 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 6f3ed0e..af90bc4 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -33,6 +33,7 @@ + #include <linux/slab.h> + #include <linux/tboot.h> + #include <linux/hrtimer.h> ++#include <linux/nospec.h> + #include "kvm_cache_regs.h" + #include "x86.h" + +@@ -856,21 +857,18 @@ static const unsigned short vmcs_field_to_offset_table[] = { + + static inline short vmcs_field_to_offset(unsigned long field) + { +- BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); ++ const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); ++ unsigned short offset; + +- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) ++ BUILD_BUG_ON(size > SHRT_MAX); ++ if (field >= size) + return -ENOENT; + +- /* +- * FIXME: Mitigation for CVE-2017-5753. To be replaced with a +- * generic mechanism. +- */ +- asm("lfence"); +- +- if (vmcs_field_to_offset_table[field] == 0) ++ field = array_index_nospec(field, size); ++ offset = vmcs_field_to_offset_table[field]; ++ if (offset == 0) + return -ENOENT; +- +- return vmcs_field_to_offset_table[field]; ++ return offset; + } + + static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0014-KVM-nVMX-kmap-can-t-fail.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0014-KVM-nVMX-kmap-can-t-fail.patch new file mode 100644 index 00000000..38a23282 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0014-KVM-nVMX-kmap-can-t-fail.patch @@ -0,0 +1,47 @@ +From 6b359ffcb519698f93eadc2706d06805ce933086 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand <david@redhat.com> +Date: Wed, 25 Jan 2017 11:58:57 +0100 +Subject: [PATCH 14/33] KVM: nVMX: kmap() can't fail + +commit 42cf014d38d8822cce63703a467e00f65d000952 upstream. + +kmap() can't fail, therefore it will always return a valid pointer. Let's +just get rid of the unnecessary checks. 
+ +Signed-off-by: David Hildenbrand <david@redhat.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 9 --------- + 1 file changed, 9 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index af90bc4..17fcbaf 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -4742,10 +4742,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + return 0; + + vapic_page = kmap(vmx->nested.virtual_apic_page); +- if (!vapic_page) { +- WARN_ON(1); +- return -ENOMEM; +- } + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + kunmap(vmx->nested.virtual_apic_page); + +@@ -9562,11 +9558,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + return false; + } + msr_bitmap_l1 = (unsigned long *)kmap(page); +- if (!msr_bitmap_l1) { +- nested_release_page_clean(page); +- WARN_ON(1); +- return false; +- } + + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0015-KVM-nVMX-vmx_complete_nested_posted_interrupt-can-t-.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0015-KVM-nVMX-vmx_complete_nested_posted_interrupt-can-t-.patch new file mode 100644 index 00000000..806b1ac0 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0015-KVM-nVMX-vmx_complete_nested_posted_interrupt-can-t-.patch @@ -0,0 +1,69 @@ +From b53c02711255aa79e4e1a9974ca24610c4fbd7d7 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand <david@redhat.com> +Date: Wed, 25 Jan 2017 11:58:58 +0100 +Subject: [PATCH 15/33] KVM: nVMX: vmx_complete_nested_posted_interrupt() can't + fail + +(cherry picked from commit 6342c50ad12e8ce0736e722184a7dbdea4a3477f) + +vmx_complete_nested_posted_interrupt() can't fail, let's turn it into +a void function. 
+ +Signed-off-by: David Hildenbrand <david@redhat.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 17fcbaf..13dc454 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -4722,7 +4722,7 @@ static bool vmx_get_enable_apicv(void) + return enable_apicv; + } + +-static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) ++static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; +@@ -4733,13 +4733,13 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + vmx->nested.pi_pending) { + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) +- return 0; ++ return; + + max_irr = find_last_bit( + (unsigned long *)vmx->nested.pi_desc->pir, 256); + + if (max_irr == 256) +- return 0; ++ return; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); +@@ -4752,7 +4752,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + vmcs_write16(GUEST_INTR_STATUS, status); + } + } +- return 0; + } + + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +@@ -10440,7 +10439,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) + return 0; + } + +- return vmx_complete_nested_posted_interrupt(vcpu); ++ vmx_complete_nested_posted_interrupt(vcpu); ++ return 0; + } + + static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0016-KVM-nVMX-mark-vmcs12-pages-dirty-on-L2-exit.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0016-KVM-nVMX-mark-vmcs12-pages-dirty-on-L2-exit.patch new file mode 100644 index 00000000..e7f44b1b --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0016-KVM-nVMX-mark-vmcs12-pages-dirty-on-L2-exit.patch @@ -0,0 +1,119 @@ +From 50fefe1aabf115927dbe944d4607d3696ed2773e Mon Sep 17 00:00:00 2001 +From: David Matlack <dmatlack@google.com> +Date: Tue, 1 Aug 2017 14:00:40 -0700 +Subject: [PATCH 16/33] KVM: nVMX: mark vmcs12 pages dirty on L2 exit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +(cherry picked from commit c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570) + +The host physical addresses of L1's Virtual APIC Page and Posted +Interrupt descriptor are loaded into the VMCS02. The CPU may write +to these pages via their host physical address while L2 is running, +bypassing address-translation-based dirty tracking (e.g. EPT write +protection). Mark them dirty on every exit from L2 to prevent them +from getting out of sync with dirty tracking. + +Also mark the virtual APIC page and the posted interrupt descriptor +dirty when KVM is virtualizing posted interrupt processing. 
+ +Signed-off-by: David Matlack <dmatlack@google.com> +Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 43 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 13dc454..2e88fd1 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -4722,6 +4722,28 @@ static bool vmx_get_enable_apicv(void) + return enable_apicv; + } + ++static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) ++{ ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu); ++ gfn_t gfn; ++ ++ /* ++ * Don't need to mark the APIC access page dirty; it is never ++ * written to by the CPU during APIC virtualization. ++ */ ++ ++ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { ++ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; ++ kvm_vcpu_mark_page_dirty(vcpu, gfn); ++ } ++ ++ if (nested_cpu_has_posted_intr(vmcs12)) { ++ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; ++ kvm_vcpu_mark_page_dirty(vcpu, gfn); ++ } ++} ++ ++ + static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); +@@ -4729,18 +4751,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + void *vapic_page; + u16 status; + +- if (vmx->nested.pi_desc && +- vmx->nested.pi_pending) { +- vmx->nested.pi_pending = false; +- if (!pi_test_and_clear_on(vmx->nested.pi_desc)) +- return; +- +- max_irr = find_last_bit( +- (unsigned long *)vmx->nested.pi_desc->pir, 256); ++ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) ++ return; + +- if (max_irr == 256) +- return; ++ vmx->nested.pi_pending = false; ++ if (!pi_test_and_clear_on(vmx->nested.pi_desc)) ++ return; + ++ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); ++ if (max_irr != 256) { + vapic_page = kmap(vmx->nested.virtual_apic_page); + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + kunmap(vmx->nested.virtual_apic_page); +@@ -4752,6 +4771,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) + vmcs_write16(GUEST_INTR_STATUS, status); + } + } ++ ++ nested_mark_vmcs12_pages_dirty(vcpu); + } + + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +@@ -8009,6 +8030,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + ++ /* ++ * The host physical addresses of some pages of guest memory ++ * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU ++ * may write to these pages via their host physical address while ++ * L2 is running, bypassing any address-translation-based dirty ++ * tracking (e.g. EPT write protection). ++ * ++ * Mark them dirty on every exit from L2 to prevent them from ++ * getting out of sync with dirty tracking. 
++ */ ++ nested_mark_vmcs12_pages_dirty(vcpu); ++ + if (vmx->nested.nested_run_pending) + return false; + +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0017-KVM-nVMX-Eliminate-vmcs02-pool.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0017-KVM-nVMX-Eliminate-vmcs02-pool.patch new file mode 100644 index 00000000..96687e49 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0017-KVM-nVMX-Eliminate-vmcs02-pool.patch @@ -0,0 +1,295 @@ +From 8e52c41b7072930e5951b324964f31ef6991f3af Mon Sep 17 00:00:00 2001 +From: Jim Mattson <jmattson@google.com> +Date: Mon, 27 Nov 2017 17:22:25 -0600 +Subject: [PATCH 17/33] KVM: nVMX: Eliminate vmcs02 pool +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +(cherry picked from commit de3a0021a60635de96aa92713c1a31a96747d72c) + +The potential performance advantages of a vmcs02 pool have never been +realized. To simplify the code, eliminate the pool. Instead, a single +vmcs02 is allocated per VCPU when the VCPU enters VMX operation. + +Cc: stable@vger.kernel.org # prereq for Spectre mitigation +Signed-off-by: Jim Mattson <jmattson@google.com> +Signed-off-by: Mark Kanda <mark.kanda@oracle.com> +Reviewed-by: Ameya More <ameya.more@oracle.com> +Reviewed-by: David Hildenbrand <david@redhat.com> +Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 146 +++++++++-------------------------------------------- + 1 file changed, 23 insertions(+), 123 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 2e88fd1..099f221 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -174,7 +174,6 @@ module_param(ple_window_max, int, S_IRUGO); + extern const ulong vmx_return; + + #define NR_AUTOLOAD_MSRS 8 +-#define VMCS02_POOL_SIZE 1 + + struct vmcs { + u32 revision_id; +@@ -208,7 +207,7 @@ struct shared_msr_entry { + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. +- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the ++ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). +@@ -387,13 +386,6 @@ struct __packed vmcs12 { + */ + #define VMCS12_SIZE 0x1000 + +-/* Used to remember the last vmcs02 used for some recently used vmcs12s */ +-struct vmcs02_list { +- struct list_head list; +- gpa_t vmptr; +- struct loaded_vmcs vmcs02; +-}; +- + /* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. +@@ -420,15 +412,15 @@ struct nested_vmx { + */ + bool sync_shadow_vmcs; + +- /* vmcs02_list cache of VMCSs recently used to run L2 guests */ +- struct list_head vmcs02_pool; +- int vmcs02_num; + bool change_vmcs01_virtual_x2apic_mode; + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; ++ ++ struct loaded_vmcs vmcs02; ++ + /* +- * Guest pages referred to in vmcs02 with host-physical pointers, so +- * we must keep them pinned while L2 runs. 
++ * Guest pages referred to in the vmcs02 with host-physical ++ * pointers, so we must keep them pinned while L2 runs. + */ + struct page *apic_access_page; + struct page *virtual_apic_page; +@@ -6657,94 +6649,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu) + } + + /* +- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. +- * We could reuse a single VMCS for all the L2 guests, but we also want the +- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this +- * allows keeping them loaded on the processor, and in the future will allow +- * optimizations where prepare_vmcs02 doesn't need to set all the fields on +- * every entry if they never change. +- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE +- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. +- * +- * The following functions allocate and free a vmcs02 in this pool. +- */ +- +-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ +-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) +-{ +- struct vmcs02_list *item; +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) +- if (item->vmptr == vmx->nested.current_vmptr) { +- list_move(&item->list, &vmx->nested.vmcs02_pool); +- return &item->vmcs02; +- } +- +- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { +- /* Recycle the least recently used VMCS. */ +- item = list_last_entry(&vmx->nested.vmcs02_pool, +- struct vmcs02_list, list); +- item->vmptr = vmx->nested.current_vmptr; +- list_move(&item->list, &vmx->nested.vmcs02_pool); +- return &item->vmcs02; +- } +- +- /* Create a new VMCS */ +- item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); +- if (!item) +- return NULL; +- item->vmcs02.vmcs = alloc_vmcs(); +- item->vmcs02.shadow_vmcs = NULL; +- if (!item->vmcs02.vmcs) { +- kfree(item); +- return NULL; +- } +- loaded_vmcs_init(&item->vmcs02); +- item->vmptr = vmx->nested.current_vmptr; +- list_add(&(item->list), &(vmx->nested.vmcs02_pool)); +- vmx->nested.vmcs02_num++; +- return &item->vmcs02; +-} +- +-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ +-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) +-{ +- struct vmcs02_list *item; +- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) +- if (item->vmptr == vmptr) { +- free_loaded_vmcs(&item->vmcs02); +- list_del(&item->list); +- kfree(item); +- vmx->nested.vmcs02_num--; +- return; +- } +-} +- +-/* +- * Free all VMCSs saved for this vcpu, except the one pointed by +- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs +- * must be &vmx->vmcs01. +- */ +-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) +-{ +- struct vmcs02_list *item, *n; +- +- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); +- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { +- /* +- * Something will leak if the above WARN triggers. Better than +- * a use-after-free. +- */ +- if (vmx->loaded_vmcs == &item->vmcs02) +- continue; +- +- free_loaded_vmcs(&item->vmcs02); +- list_del(&item->list); +- kfree(item); +- vmx->nested.vmcs02_num--; +- } +-} +- +-/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". 
+@@ -7051,6 +6955,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu) + return 1; + } + ++ vmx->nested.vmcs02.vmcs = alloc_vmcs(); ++ vmx->nested.vmcs02.shadow_vmcs = NULL; ++ if (!vmx->nested.vmcs02.vmcs) ++ goto out_vmcs02; ++ loaded_vmcs_init(&vmx->nested.vmcs02); ++ + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); +@@ -7073,9 +6983,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) + vmx->vmcs01.shadow_vmcs = shadow_vmcs; + } + +- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); +- vmx->nested.vmcs02_num = 0; +- + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; +@@ -7093,6 +7000,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) + free_page((unsigned long)vmx->nested.msr_bitmap); + + out_msr_bitmap: ++ free_loaded_vmcs(&vmx->nested.vmcs02); ++ ++out_vmcs02: + return -ENOMEM; + } + +@@ -7178,7 +7088,7 @@ static void free_nested(struct vcpu_vmx *vmx) + vmx->vmcs01.shadow_vmcs = NULL; + } + kfree(vmx->nested.cached_vmcs12); +- /* Unpin physical memory we referred to in current vmcs02 */ ++ /* Unpin physical memory we referred to in the vmcs02 */ + if (vmx->nested.apic_access_page) { + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; +@@ -7194,7 +7104,7 @@ static void free_nested(struct vcpu_vmx *vmx) + vmx->nested.pi_desc = NULL; + } + +- nested_free_all_saved_vmcss(vmx); ++ free_loaded_vmcs(&vmx->nested.vmcs02); + } + + /* Emulate the VMXOFF instruction */ +@@ -7242,8 +7152,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) + kunmap(page); + nested_release_page(page); + +- nested_free_vmcs02(vmx, vmptr); +- + skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); + return 1; +@@ -8032,10 +7940,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) + + /* + * The host physical addresses of some pages of guest memory +- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU +- * may write to these pages via their host physical address while +- * L2 is running, bypassing any address-translation-based dirty +- * tracking (e.g. EPT write protection). ++ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC ++ * Page). The CPU may write to these pages via their host ++ * physical address while L2 is running, bypassing any ++ * address-translation-based dirty tracking (e.g. EPT write ++ * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. +@@ -10170,7 +10079,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; +- struct loaded_vmcs *vmcs02; + bool ia32e; + u32 msr_entry_idx; + +@@ -10310,17 +10218,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) + * the nested entry. 
+ */ + +- vmcs02 = nested_get_current_vmcs02(vmx); +- if (!vmcs02) +- return -ENOMEM; +- + enter_guest_mode(vcpu); + + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + + cpu = get_cpu(); +- vmx->loaded_vmcs = vmcs02; ++ vmx->loaded_vmcs = &vmx->nested.vmcs02; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; +@@ -10833,10 +10737,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + vm_exit_controls_reset_shadow(vmx); + vmx_segment_cache_clear(vmx); + +- /* if no vmcs02 cache requested, remove the one we used */ +- if (VMCS02_POOL_SIZE == 0) +- nested_free_vmcs02(vmx, vmx->nested.current_vmptr); +- + load_vmcs12_host_state(vcpu, vmcs12); + + /* Update any VMCS fields that might have changed while L2 ran */ +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0018-KVM-VMX-introduce-alloc_loaded_vmcs.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0018-KVM-VMX-introduce-alloc_loaded_vmcs.patch new file mode 100644 index 00000000..a22f91a8 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0018-KVM-VMX-introduce-alloc_loaded_vmcs.patch @@ -0,0 +1,104 @@ +From 80f4f0e9de9cce1047ac0aac305aca7310e37313 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini <pbonzini@redhat.com> +Date: Thu, 11 Jan 2018 12:16:15 +0100 +Subject: [PATCH 18/33] KVM: VMX: introduce alloc_loaded_vmcs + +(cherry picked from commit f21f165ef922c2146cc5bdc620f542953c41714b) + +Group together the calls to alloc_vmcs and loaded_vmcs_init. Soon we'll also +allocate an MSR bitmap there. + +Cc: stable@vger.kernel.org # prereq for Spectre mitigation +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++--------------- + 1 file changed, 23 insertions(+), 15 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 099f221..6814355 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -3514,11 +3514,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) + return vmcs; + } + +-static struct vmcs *alloc_vmcs(void) +-{ +- return alloc_vmcs_cpu(raw_smp_processor_id()); +-} +- + static void free_vmcs(struct vmcs *vmcs) + { + free_pages((unsigned long)vmcs, vmcs_config.order); +@@ -3537,6 +3532,22 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); + } + ++static struct vmcs *alloc_vmcs(void) ++{ ++ return alloc_vmcs_cpu(raw_smp_processor_id()); ++} ++ ++static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) ++{ ++ loaded_vmcs->vmcs = alloc_vmcs(); ++ if (!loaded_vmcs->vmcs) ++ return -ENOMEM; ++ ++ loaded_vmcs->shadow_vmcs = NULL; ++ loaded_vmcs_init(loaded_vmcs); ++ return 0; ++} ++ + static void free_kvm_area(void) + { + int cpu; +@@ -6916,6 +6927,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) + struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; ++ int r; + + /* The Intel VMX Instruction Reference lists a bunch of bits that + * are prerequisite to running VMXON, most notably cr4.VMXE must be +@@ -6955,11 +6967,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) + return 1; + } + +- vmx->nested.vmcs02.vmcs = alloc_vmcs(); +- vmx->nested.vmcs02.shadow_vmcs = NULL; +- if (!vmx->nested.vmcs02.vmcs) ++ r = alloc_loaded_vmcs(&vmx->nested.vmcs02); ++ 
if (r < 0) + goto out_vmcs02; +- loaded_vmcs_init(&vmx->nested.vmcs02); + + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = +@@ -9090,17 +9100,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) + if (!vmx->guest_msrs) + goto free_pml; + +- vmx->loaded_vmcs = &vmx->vmcs01; +- vmx->loaded_vmcs->vmcs = alloc_vmcs(); +- vmx->loaded_vmcs->shadow_vmcs = NULL; +- if (!vmx->loaded_vmcs->vmcs) +- goto free_msrs; + if (!vmm_exclusive) + kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); +- loaded_vmcs_init(vmx->loaded_vmcs); ++ err = alloc_loaded_vmcs(&vmx->vmcs01); + if (!vmm_exclusive) + kvm_cpu_vmxoff(); ++ if (err < 0) ++ goto free_msrs; + ++ vmx->loaded_vmcs = &vmx->vmcs01; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0019-KVM-VMX-make-MSR-bitmaps-per-VCPU.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0019-KVM-VMX-make-MSR-bitmaps-per-VCPU.patch new file mode 100644 index 00000000..0a8db555 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0019-KVM-VMX-make-MSR-bitmaps-per-VCPU.patch @@ -0,0 +1,585 @@ +From cc42f184dfdfed46c394274020b84a1641f24714 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini <pbonzini@redhat.com> +Date: Tue, 16 Jan 2018 16:51:18 +0100 +Subject: [PATCH 19/33] KVM: VMX: make MSR bitmaps per-VCPU + +(cherry picked from commit 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6) + +Place the MSR bitmap in struct loaded_vmcs, and update it in place +every time the x2apic or APICv state can change. This is rare and +the loop can handle 64 MSRs per iteration, in a similar fashion as +nested_vmx_prepare_msr_bitmap. + +This prepares for choosing, on a per-VM basis, whether to intercept +the SPEC_CTRL and PRED_CMD MSRs. + +Cc: stable@vger.kernel.org # prereq for Spectre mitigation +Suggested-by: Jim Mattson <jmattson@google.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 315 +++++++++++++++++++---------------------------------- + 1 file changed, 114 insertions(+), 201 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 6814355..c6a7563 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -110,6 +110,14 @@ static u64 __read_mostly host_xss; + static bool __read_mostly enable_pml = 1; + module_param_named(pml, enable_pml, bool, S_IRUGO); + ++#define MSR_TYPE_R 1 ++#define MSR_TYPE_W 2 ++#define MSR_TYPE_RW 3 ++ ++#define MSR_BITMAP_MODE_X2APIC 1 ++#define MSR_BITMAP_MODE_X2APIC_APICV 2 ++#define MSR_BITMAP_MODE_LM 4 ++ + #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + + /* Guest_tsc -> host_tsc conversion requires 64-bit division. 
*/ +@@ -191,6 +199,7 @@ struct loaded_vmcs { + struct vmcs *shadow_vmcs; + int cpu; + int launched; ++ unsigned long *msr_bitmap; + struct list_head loaded_vmcss_on_cpu_link; + }; + +@@ -429,8 +438,6 @@ struct nested_vmx { + bool pi_pending; + u16 posted_intr_nv; + +- unsigned long *msr_bitmap; +- + struct hrtimer preemption_timer; + bool preemption_timer_expired; + +@@ -531,6 +538,7 @@ struct vcpu_vmx { + unsigned long host_rsp; + u8 fail; + bool nmi_known_unmasked; ++ u8 msr_bitmap_mode; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; +@@ -902,6 +910,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); + static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); + static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); + static int alloc_identity_pagetable(struct kvm *kvm); ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); + + static DEFINE_PER_CPU(struct vmcs *, vmxarea); + static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +@@ -921,12 +930,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + + static unsigned long *vmx_io_bitmap_a; + static unsigned long *vmx_io_bitmap_b; +-static unsigned long *vmx_msr_bitmap_legacy; +-static unsigned long *vmx_msr_bitmap_longmode; +-static unsigned long *vmx_msr_bitmap_legacy_x2apic; +-static unsigned long *vmx_msr_bitmap_longmode_x2apic; +-static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +-static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + static unsigned long *vmx_vmread_bitmap; + static unsigned long *vmx_vmwrite_bitmap; + +@@ -2517,36 +2520,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) + vmx->guest_msrs[from] = tmp; + } + +-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) +-{ +- unsigned long *msr_bitmap; +- +- if (is_guest_mode(vcpu)) +- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; +- else if (cpu_has_secondary_exec_ctrls() && +- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & +- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { +- if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic; +- else +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic; +- } else { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; +- else +- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +- } +- } else { +- if (is_long_mode(vcpu)) +- msr_bitmap = vmx_msr_bitmap_longmode; +- else +- msr_bitmap = vmx_msr_bitmap_legacy; +- } +- +- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); +-} +- + /* + * Set up the vmcs to automatically save and restore system + * msrs. 
Don't touch the 64-bit msrs if the guest is in legacy +@@ -2587,7 +2560,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) + vmx->save_nmsrs = save_nmsrs; + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(&vmx->vcpu); ++ vmx_update_msr_bitmap(&vmx->vcpu); + } + + /* +@@ -3529,6 +3502,8 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) + loaded_vmcs_clear(loaded_vmcs); + free_vmcs(loaded_vmcs->vmcs); + loaded_vmcs->vmcs = NULL; ++ if (loaded_vmcs->msr_bitmap) ++ free_page((unsigned long)loaded_vmcs->msr_bitmap); + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); + } + +@@ -3545,7 +3520,18 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) + + loaded_vmcs->shadow_vmcs = NULL; + loaded_vmcs_init(loaded_vmcs); ++ ++ if (cpu_has_vmx_msr_bitmap()) { ++ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); ++ if (!loaded_vmcs->msr_bitmap) ++ goto out_vmcs; ++ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); ++ } + return 0; ++ ++out_vmcs: ++ free_loaded_vmcs(loaded_vmcs); ++ return -ENOMEM; + } + + static void free_kvm_area(void) +@@ -4548,10 +4534,8 @@ static void free_vpid(int vpid) + spin_unlock(&vmx_vpid_lock); + } + +-#define MSR_TYPE_R 1 +-#define MSR_TYPE_W 2 +-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +- u32 msr, int type) ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type) + { + int f = sizeof(unsigned long); + +@@ -4585,8 +4569,8 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + } + } + +-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +- u32 msr, int type) ++static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type) + { + int f = sizeof(unsigned long); + +@@ -4620,6 +4604,15 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + } + } + ++static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type, bool value) ++{ ++ if (value) ++ vmx_enable_intercept_for_msr(msr_bitmap, msr, type); ++ else ++ vmx_disable_intercept_for_msr(msr_bitmap, msr, type); ++} ++ + /* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. 
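The rewritten helpers in the following hunks all index the same 4 KiB hardware MSR bitmap. For orientation, here is a hedged userspace sketch of that layout as these patches use it (read intercepts in the low 2 KiB, at byte offset 0x000 for MSRs 0x0-0x1fff and 0x400 for 0xc0000000-0xc0001fff; write intercepts in the high 2 KiB at 0x800 and 0xc00; a set bit means the access causes a vmexit). The helper names are invented for illustration:

#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define LONG_BITS (sizeof(unsigned long) * CHAR_BIT)

/* Test one intercept bit at a byte offset into the 4 KiB bitmap page. */
static bool msr_bitmap_bit(const unsigned long *bitmap, size_t byte_off,
                           uint32_t idx)
{
        const unsigned long *base = bitmap + byte_off / sizeof(unsigned long);

        return (base[idx / LONG_BITS] >> (idx % LONG_BITS)) & 1;
}

/* Mirrors the write-intercept lookup the later patches in this series add. */
static bool sketch_msr_write_intercepted(const unsigned long *bitmap, uint32_t msr)
{
        if (msr <= 0x1fff)
                return msr_bitmap_bit(bitmap, 0x800, msr);
        if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                return msr_bitmap_bit(bitmap, 0xc00, msr & 0x1fff);
        return true; /* MSRs outside both ranges always cause a vmexit */
}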
+@@ -4666,58 +4659,68 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + } + } + +-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) ++static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) + { +- if (!longmode_only) +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +- msr, MSR_TYPE_R | MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +- msr, MSR_TYPE_R | MSR_TYPE_W); +-} ++ u8 mode = 0; + +-static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) +-{ +- if (apicv_active) { +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_R); +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_R); +- } else { +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, +- msr, MSR_TYPE_R); +- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, +- msr, MSR_TYPE_R); ++ if (cpu_has_secondary_exec_ctrls() && ++ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & ++ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { ++ mode |= MSR_BITMAP_MODE_X2APIC; ++ if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) ++ mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } ++ ++ if (is_long_mode(vcpu)) ++ mode |= MSR_BITMAP_MODE_LM; ++ ++ return mode; + } + +-static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) ++#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) ++ ++static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, ++ u8 mode) + { +- if (apicv_active) { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_R); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_R); +- } else { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, +- msr, MSR_TYPE_R); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, +- msr, MSR_TYPE_R); ++ int msr; ++ ++ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { ++ unsigned word = msr / BITS_PER_LONG; ++ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; ++ msr_bitmap[word + (0x800 / sizeof(long))] = ~0; ++ } ++ ++ if (mode & MSR_BITMAP_MODE_X2APIC) { ++ /* ++ * TPR reads and writes can be virtualized even if virtual interrupt ++ * delivery is not in use. 
++ */ ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); ++ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { ++ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); ++ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); ++ } + } + } + +-static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) ++static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) + { +- if (apicv_active) { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, +- msr, MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, +- msr, MSR_TYPE_W); +- } else { +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, +- msr, MSR_TYPE_W); +- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, +- msr, MSR_TYPE_W); +- } ++ struct vcpu_vmx *vmx = to_vmx(vcpu); ++ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; ++ u8 mode = vmx_msr_bitmap_mode(vcpu); ++ u8 changed = mode ^ vmx->msr_bitmap_mode; ++ ++ if (!changed) ++ return; ++ ++ vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, ++ !(mode & MSR_BITMAP_MODE_LM)); ++ ++ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) ++ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); ++ ++ vmx->msr_bitmap_mode = mode; + } + + static bool vmx_get_enable_apicv(void) +@@ -4953,7 +4956,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) + } + + if (cpu_has_vmx_msr_bitmap()) +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + } + + static u32 vmx_exec_control(struct vcpu_vmx *vmx) +@@ -5042,7 +5045,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + } + if (cpu_has_vmx_msr_bitmap()) +- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); ++ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + +@@ -6371,7 +6374,7 @@ static void wakeup_handler(void) + + static __init int hardware_setup(void) + { +- int r = -ENOMEM, i, msr; ++ int r = -ENOMEM, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + +@@ -6386,41 +6389,13 @@ static __init int hardware_setup(void) + if (!vmx_io_bitmap_b) + goto out; + +- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_legacy) +- goto out1; +- +- vmx_msr_bitmap_legacy_x2apic = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_legacy_x2apic) +- goto out2; +- +- vmx_msr_bitmap_legacy_x2apic_apicv_inactive = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) +- goto out3; +- +- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_longmode) +- goto out4; +- +- vmx_msr_bitmap_longmode_x2apic = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_longmode_x2apic) +- goto out5; +- +- vmx_msr_bitmap_longmode_x2apic_apicv_inactive = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) +- goto out6; +- + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) +- goto out7; ++ goto out1; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) +- goto out8; ++ goto out2; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + 
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); +@@ -6434,12 +6409,9 @@ static __init int hardware_setup(void) + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + +- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); +- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); +- + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; +- goto out9; ++ goto out3; + } + + if (boot_cpu_has(X86_FEATURE_NX)) +@@ -6494,48 +6466,8 @@ static __init int hardware_setup(void) + kvm_tsc_scaling_ratio_frac_bits = 48; + } + +- vmx_disable_intercept_for_msr(MSR_FS_BASE, false); +- vmx_disable_intercept_for_msr(MSR_GS_BASE, false); +- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); +- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); +- vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); +- +- memcpy(vmx_msr_bitmap_legacy_x2apic, +- vmx_msr_bitmap_legacy, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_longmode_x2apic, +- vmx_msr_bitmap_longmode, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, +- vmx_msr_bitmap_legacy, PAGE_SIZE); +- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, +- vmx_msr_bitmap_longmode, PAGE_SIZE); +- + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + +- /* +- * enable_apicv && kvm_vcpu_apicv_active() +- */ +- for (msr = 0x800; msr <= 0x8ff; msr++) +- vmx_disable_intercept_msr_read_x2apic(msr, true); +- +- /* TMCCT */ +- vmx_enable_intercept_msr_read_x2apic(0x839, true); +- /* TPR */ +- vmx_disable_intercept_msr_write_x2apic(0x808, true); +- /* EOI */ +- vmx_disable_intercept_msr_write_x2apic(0x80b, true); +- /* SELF-IPI */ +- vmx_disable_intercept_msr_write_x2apic(0x83f, true); +- +- /* +- * (enable_apicv && !kvm_vcpu_apicv_active()) || +- * !enable_apicv +- */ +- /* TPR */ +- vmx_disable_intercept_msr_read_x2apic(0x808, false); +- vmx_disable_intercept_msr_write_x2apic(0x808, false); +- + if (enable_ept) { + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + (enable_ept_ad_bits) ? 
VMX_EPT_ACCESS_BIT : 0ull, +@@ -6581,22 +6513,10 @@ static __init int hardware_setup(void) + + return alloc_kvm_area(); + +-out9: +- free_page((unsigned long)vmx_vmwrite_bitmap); +-out8: +- free_page((unsigned long)vmx_vmread_bitmap); +-out7: +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); +-out6: +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +-out5: +- free_page((unsigned long)vmx_msr_bitmap_longmode); +-out4: +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); + out3: +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); ++ free_page((unsigned long)vmx_vmwrite_bitmap); + out2: +- free_page((unsigned long)vmx_msr_bitmap_legacy); ++ free_page((unsigned long)vmx_vmread_bitmap); + out1: + free_page((unsigned long)vmx_io_bitmap_b); + out: +@@ -6607,12 +6527,6 @@ static __init int hardware_setup(void) + + static __exit void hardware_unsetup(void) + { +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); +- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); +- free_page((unsigned long)vmx_msr_bitmap_legacy); +- free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); +@@ -6971,13 +6885,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) + if (r < 0) + goto out_vmcs02; + +- if (cpu_has_vmx_msr_bitmap()) { +- vmx->nested.msr_bitmap = +- (unsigned long *)__get_free_page(GFP_KERNEL); +- if (!vmx->nested.msr_bitmap) +- goto out_msr_bitmap; +- } +- + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; +@@ -7007,9 +6914,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) + kfree(vmx->nested.cached_vmcs12); + + out_cached_vmcs12: +- free_page((unsigned long)vmx->nested.msr_bitmap); +- +-out_msr_bitmap: + free_loaded_vmcs(&vmx->nested.vmcs02); + + out_vmcs02: +@@ -7088,10 +6992,6 @@ static void free_nested(struct vcpu_vmx *vmx) + vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); + nested_release_vmcs12(vmx); +- if (vmx->nested.msr_bitmap) { +- free_page((unsigned long)vmx->nested.msr_bitmap); +- vmx->nested.msr_bitmap = NULL; +- } + if (enable_shadow_vmcs) { + vmcs_clear(vmx->vmcs01.shadow_vmcs); + free_vmcs(vmx->vmcs01.shadow_vmcs); +@@ -8450,7 +8350,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + +- vmx_set_msr_bitmap(vcpu); ++ vmx_update_msr_bitmap(vcpu); + } + + static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +@@ -9068,6 +8968,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) + { + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); ++ unsigned long *msr_bitmap; + int cpu; + + if (!vmx) +@@ -9108,6 +9009,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) + if (err < 0) + goto free_msrs; + ++ msr_bitmap = vmx->vmcs01.msr_bitmap; ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); ++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); ++ 
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
++ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
++ vmx->msr_bitmap_mode = 0;
++
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ cpu = get_cpu();
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+@@ -9495,7 +9405,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+ int msr;
+ struct page *page;
+ unsigned long *msr_bitmap_l1;
+- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
++ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+
+ /* This shortcut is ok because we support only x2APIC MSRs so far. */
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+@@ -10007,6 +9917,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+ if (kvm_has_tsc_control)
+ decache_tsc_multiplier(vmx);
+
++ if (cpu_has_vmx_msr_bitmap())
++ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
++
+ if (enable_vpid) {
+ /*
+ * There is no direct mapping between vpid02 and vpid12, the
+@@ -10694,7 +10607,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_msr_bitmap())
+- vmx_set_msr_bitmap(vcpu);
++ vmx_update_msr_bitmap(vcpu);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+--
+2.7.4
+
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0020-KVM-x86-Add-IBPB-support.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0020-KVM-x86-Add-IBPB-support.patch
new file mode 100644
index 00000000..731a182a
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0020-KVM-x86-Add-IBPB-support.patch
@@ -0,0 +1,352 @@
+From b70d7889c078c97d11ae6412760f3231fda324cd Mon Sep 17 00:00:00 2001
+From: Ashok Raj <ashok.raj@intel.com>
+Date: Thu, 1 Feb 2018 22:59:43 +0100
+Subject: [PATCH 20/33] KVM/x86: Add IBPB support
+
+(cherry picked from commit 15d45071523d89b3fb7372e2135fbd72f6af9506)
+
+The Indirect Branch Predictor Barrier (IBPB) is an indirect branch
+control mechanism. It keeps earlier branches from influencing
+later ones.
+
+Unlike IBRS and STIBP, IBPB does not define a new mode of operation.
+It's a command that ensures predicted branch targets aren't used after
+the barrier. Although IBRS and IBPB are enumerated by the same CPUID
+enumeration, IBPB is very different.
+
+IBPB helps mitigate against three potential attacks:
+
+* Mitigate guests from being attacked by other guests.
+ - This is addressed by issuing an IBPB when we do a guest switch.
+
+* Mitigate attacks from guest/ring3->host/ring3.
+ These would require an IBPB during a context switch in the host, or after
+ VMEXIT. The host process has two ways to mitigate:
+ - Either it can be compiled with retpoline
+ - If it's going through a context switch and has set !dumpable, then
+ there is an IBPB in that path.
+ (Tim's patch: https://patchwork.kernel.org/patch/10192871)
+ - The case where after a VMEXIT you return back to Qemu might make
+ Qemu attackable from the guest when Qemu isn't compiled with retpoline.
+ There are issues reported when doing IBPB on every VMEXIT that resulted
+ in some tsc calibration woes in the guest.
+
+* Mitigate guest/ring0->host/ring0 attacks.
+ When the host kernel is using retpoline, it is safe against these attacks.
+ If the host kernel isn't using retpoline, we might need to do an IBPB flush on
+ every VMEXIT.
+
+Even when using retpoline for indirect calls, in certain conditions 'ret'
+can use the BTB on Skylake-era CPUs.
There are other mitigations +available like RSB stuffing/clearing. + +* IBPB is issued only for SVM during svm_free_vcpu(). + VMX has a vmclear and SVM doesn't. Follow discussion here: + https://lkml.org/lkml/2018/1/15/146 + +Please refer to the following spec for more details on the enumeration +and control. + +Refer here to get documentation about mitigations. + +https://software.intel.com/en-us/side-channel-security-support + +[peterz: rebase and changelog rewrite] +[karahmed: - rebase + - vmx: expose PRED_CMD if guest has it in CPUID + - svm: only pass through IBPB if guest has it in CPUID + - vmx: support !cpu_has_vmx_msr_bitmap()] + - vmx: support nested] +[dwmw2: Expose CPUID bit too (AMD IBPB only for now as we lack IBRS) + PRED_CMD is a write-only MSR] + +Signed-off-by: Ashok Raj <ashok.raj@intel.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: kvm@vger.kernel.org +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Link: http://lkml.kernel.org/r/1515720739-43819-6-git-send-email-ashok.raj@intel.com +Link: https://lkml.kernel.org/r/1517522386-18410-3-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/cpuid.c | 11 +++++++- + arch/x86/kvm/cpuid.h | 12 ++++++++ + arch/x86/kvm/svm.c | 28 +++++++++++++++++++ + arch/x86/kvm/vmx.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++-- + 4 files changed, 127 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index afa7bbb..42323be 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -355,6 +355,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + ++ /* cpuid 0x80000008.ebx */ ++ const u32 kvm_cpuid_8000_0008_ebx_x86_features = ++ F(IBPB); ++ + /* cpuid 0xC0000001.edx */ + const u32 kvm_cpuid_C000_0001_edx_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | +@@ -607,7 +611,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); +- entry->ebx = entry->edx = 0; ++ entry->edx = 0; ++ /* IBPB isn't necessarily present in hardware cpuid */ ++ if (boot_cpu_has(X86_FEATURE_IBPB)) ++ entry->ebx |= F(IBPB); ++ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; ++ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + break; + } + case 0x80000019: +diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h +index 35058c2..f4a2a1a 100644 +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -152,6 +152,18 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) + return best && (best->edx & 
bit(X86_FEATURE_RDTSCP)); + } + ++static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); ++ if (best && (best->ebx & bit(X86_FEATURE_IBPB))) ++ return true; ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); ++} ++ ++ + /* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 491f077..43e45b9 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -248,6 +248,7 @@ static const struct svm_direct_access_msrs { + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, + #endif ++ { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, + { .index = MSR_IA32_LASTINTFROMIP, .always = false }, +@@ -510,6 +511,7 @@ struct svm_cpu_data { + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; ++ struct vmcb *current_vmcb; + }; + + static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +@@ -1641,11 +1643,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); ++ /* ++ * The vmcb page can be recycled, causing a false negative in ++ * svm_vcpu_load(). So do a full IBPB now. ++ */ ++ indirect_branch_prediction_barrier(); + } + + static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + { + struct vcpu_svm *svm = to_svm(vcpu); ++ struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + int i; + + if (unlikely(cpu != vcpu->cpu)) { +@@ -1674,6 +1682,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + if (static_cpu_has(X86_FEATURE_RDTSCP)) + wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + ++ if (sd->current_vmcb != svm->vmcb) { ++ sd->current_vmcb = svm->vmcb; ++ indirect_branch_prediction_barrier(); ++ } + avic_vcpu_load(vcpu, cpu); + } + +@@ -3587,6 +3599,22 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr); + break; ++ case MSR_IA32_PRED_CMD: ++ if (!msr->host_initiated && ++ !guest_cpuid_has_ibpb(vcpu)) ++ return 1; ++ ++ if (data & ~PRED_CMD_IBPB) ++ return 1; ++ ++ if (!data) ++ break; ++ ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); ++ if (is_guest_mode(vcpu)) ++ break; ++ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); ++ break; + case MSR_STAR: + svm->vmcb->save.star = data; + break; +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index c6a7563..855df75 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -550,6 +550,7 @@ struct vcpu_vmx { + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; + #endif ++ + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + /* +@@ -911,6 +912,8 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); + static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); + static int alloc_identity_pagetable(struct kvm *kvm); + static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); ++static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, ++ u32 msr, int type); + + static DEFINE_PER_CPU(struct vmcs *, vmxarea); + static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +@@ -1841,6 +1844,29 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) + vmcs_write32(EXCEPTION_BITMAP, eb); + } + ++/* ++ * Check if MSR is 
intercepted for L01 MSR bitmap. ++ */ ++static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ unsigned long *msr_bitmap; ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return true; ++ ++ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; ++ ++ if (msr <= 0x1fff) { ++ return !!test_bit(msr, msr_bitmap + 0x800 / f); ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f); ++ } ++ ++ return true; ++} ++ + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) + { +@@ -2252,6 +2278,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); ++ indirect_branch_prediction_barrier(); + } + + if (!already_loaded) { +@@ -3048,6 +3075,33 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; ++ case MSR_IA32_PRED_CMD: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibpb(vcpu)) ++ return 1; ++ ++ if (data & ~PRED_CMD_IBPB) ++ return 1; ++ ++ if (!data) ++ break; ++ ++ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_vmx_merge_msr_bitmap. We should not touch the ++ * vmcs02.msr_bitmap here since it gets completely overwritten ++ * in the merging. ++ */ ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, ++ MSR_TYPE_W); ++ break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) +@@ -9406,9 +9460,23 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct page *page; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; ++ /* ++ * pred_cmd is trying to verify two things: ++ * ++ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This ++ * ensures that we do not accidentally generate an L02 MSR bitmap ++ * from the L12 MSR bitmap that is too permissive. ++ * 2. That L1 or L2s have actually used the MSR. This avoids ++ * unnecessarily merging of the bitmap if the MSR is unused. This ++ * works properly because we only update the L01 MSR bitmap lazily. ++ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only ++ * updated to reflect this when L1 (or its L2s) actually write to ++ * the MSR. ++ */ ++ bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + +- /* This shortcut is ok because we support only x2APIC MSRs so far. 
*/ +- if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) ++ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && ++ !pred_cmd) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); +@@ -9443,6 +9511,13 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + MSR_TYPE_W); + } + } ++ ++ if (pred_cmd) ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, ++ MSR_IA32_PRED_CMD, ++ MSR_TYPE_W); ++ + kunmap(page); + nested_release_page_clean(page); + +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0021-KVM-VMX-Emulate-MSR_IA32_ARCH_CAPABILITIES.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0021-KVM-VMX-Emulate-MSR_IA32_ARCH_CAPABILITIES.patch new file mode 100644 index 00000000..538a1137 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0021-KVM-VMX-Emulate-MSR_IA32_ARCH_CAPABILITIES.patch @@ -0,0 +1,156 @@ +From dc7636423649302a329856f238df8820b9c7dc28 Mon Sep 17 00:00:00 2001 +From: KarimAllah Ahmed <karahmed@amazon.de> +Date: Thu, 1 Feb 2018 22:59:44 +0100 +Subject: [PATCH 21/33] KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES + +(cherry picked from commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd) + +Intel processors use MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO +(bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default the +contents will come directly from the hardware, but user-space can still +override it. + +[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional] + +Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> +Reviewed-by: Darren Kenny <darren.kenny@oracle.com> +Reviewed-by: Jim Mattson <jmattson@google.com> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: kvm@vger.kernel.org +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Ashok Raj <ashok.raj@intel.com> +Link: https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/cpuid.c | 8 +++++++- + arch/x86/kvm/cpuid.h | 8 ++++++++ + arch/x86/kvm/vmx.c | 15 +++++++++++++++ + arch/x86/kvm/x86.c | 1 + + 4 files changed, 31 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index 42323be..4d3555b 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -380,6 +380,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + ++ /* cpuid 7.0.edx*/ ++ const u32 kvm_cpuid_7_0_edx_x86_features = ++ F(ARCH_CAPABILITIES); ++ + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); + +@@ -462,12 +466,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + /* PKU is not yet implemented for shadow paging. 
*/ + if (!tdp_enabled) + entry->ecx &= ~F(PKU); ++ entry->edx &= kvm_cpuid_7_0_edx_x86_features; ++ cpuid_mask(&entry->edx, CPUID_7_EDX); + } else { + entry->ebx = 0; + entry->ecx = 0; ++ entry->edx = 0; + } + entry->eax = 0; +- entry->edx = 0; + break; + } + case 9: +diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h +index f4a2a1a..a69906c 100644 +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -163,6 +163,14 @@ static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); + } + ++static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES)); ++} ++ + + /* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 855df75..d8e3c02 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -551,6 +551,8 @@ struct vcpu_vmx { + u64 msr_guest_kernel_gs_base; + #endif + ++ u64 arch_capabilities; ++ + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + /* +@@ -2976,6 +2978,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_IA32_TSC: + msr_info->data = guest_read_tsc(vcpu); + break; ++ case MSR_IA32_ARCH_CAPABILITIES: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_arch_capabilities(vcpu)) ++ return 1; ++ msr_info->data = to_vmx(vcpu)->arch_capabilities; ++ break; + case MSR_IA32_SYSENTER_CS: + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + break; +@@ -3102,6 +3110,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, + MSR_TYPE_W); + break; ++ case MSR_IA32_ARCH_CAPABILITIES: ++ if (!msr_info->host_initiated) ++ return 1; ++ vmx->arch_capabilities = data; ++ break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) +@@ -5173,6 +5186,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) + ++vmx->nmsrs; + } + ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); + + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index abbb37a..d01742e 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -975,6 +975,7 @@ static u32 msrs_to_save[] = { + #endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, ++ MSR_IA32_ARCH_CAPABILITIES + }; + + static unsigned num_msrs_to_save; +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0022-KVM-VMX-Allow-direct-access-to-MSR_IA32_SPEC_CTRL.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0022-KVM-VMX-Allow-direct-access-to-MSR_IA32_SPEC_CTRL.patch new file mode 100644 index 00000000..9a833616 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0022-KVM-VMX-Allow-direct-access-to-MSR_IA32_SPEC_CTRL.patch @@ -0,0 +1,305 @@ +From 3a5351279f63e7822bbfe5c0f4ee3d5a1a5bced1 Mon Sep 17 00:00:00 2001 +From: KarimAllah Ahmed <karahmed@amazon.de> +Date: Thu, 1 Feb 2018 22:59:45 +0100 +Subject: [PATCH 22/33] KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL + +(cherry picked from commit d28b387fb74da95d69d2615732f50cceb38e9a4d) + +[ Based on a patch from Ashok Raj <ashok.raj@intel.com> ] + +Add direct 
access to MSR_IA32_SPEC_CTRL for guests. This is needed for +guests that will only mitigate Spectre V2 through IBRS+IBPB and will not +be using a retpoline+IBPB based approach. + +To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for +guests that do not actually use the MSR, only start saving and restoring +when a non-zero is written to it. + +No attempt is made to handle STIBP here, intentionally. Filtering STIBP +may be added in a future patch, which may require trapping all writes +if we don't want to pass it through directly to the guest. + +[dwmw2: Clean up CPUID bits, save/restore manually, handle reset] + +Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Darren Kenny <darren.kenny@oracle.com> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Reviewed-by: Jim Mattson <jmattson@google.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: kvm@vger.kernel.org +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Ashok Raj <ashok.raj@intel.com> +Link: https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/cpuid.c | 8 ++-- + arch/x86/kvm/cpuid.h | 11 ++++++ + arch/x86/kvm/vmx.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++- + arch/x86/kvm/x86.c | 2 +- + 4 files changed, 118 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index 4d3555b..bcebe84 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -357,7 +357,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + + /* cpuid 0x80000008.ebx */ + const u32 kvm_cpuid_8000_0008_ebx_x86_features = +- F(IBPB); ++ F(IBPB) | F(IBRS); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_cpuid_C000_0001_edx_x86_features = +@@ -382,7 +382,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = +- F(ARCH_CAPABILITIES); ++ F(SPEC_CTRL) | F(ARCH_CAPABILITIES); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); +@@ -618,9 +618,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->edx = 0; +- /* IBPB isn't necessarily present in hardware cpuid */ ++ /* IBRS and IBPB aren't necessarily present in hardware cpuid */ + if (boot_cpu_has(X86_FEATURE_IBPB)) + entry->ebx |= F(IBPB); ++ if (boot_cpu_has(X86_FEATURE_IBRS)) ++ entry->ebx |= F(IBRS); + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + break; +diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h +index a69906c..841e80d 100644 +--- a/arch/x86/kvm/cpuid.h ++++ b/arch/x86/kvm/cpuid.h +@@ -163,6 +163,17 @@ static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) + return best && (best->edx & 
bit(X86_FEATURE_SPEC_CTRL)); + } + ++static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu) ++{ ++ struct kvm_cpuid_entry2 *best; ++ ++ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); ++ if (best && (best->ebx & bit(X86_FEATURE_IBRS))) ++ return true; ++ best = kvm_find_cpuid_entry(vcpu, 7, 0); ++ return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); ++} ++ + static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) + { + struct kvm_cpuid_entry2 *best; +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index d8e3c02..c564d03 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -552,6 +552,7 @@ struct vcpu_vmx { + #endif + + u64 arch_capabilities; ++ u64 spec_ctrl; + + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; +@@ -1847,6 +1848,29 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) + } + + /* ++ * Check if MSR is intercepted for currently loaded MSR bitmap. ++ */ ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) ++{ ++ unsigned long *msr_bitmap; ++ int f = sizeof(unsigned long); ++ ++ if (!cpu_has_vmx_msr_bitmap()) ++ return true; ++ ++ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; ++ ++ if (msr <= 0x1fff) { ++ return !!test_bit(msr, msr_bitmap + 0x800 / f); ++ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { ++ msr &= 0x1fff; ++ return !!test_bit(msr, msr_bitmap + 0xc00 / f); ++ } ++ ++ return true; ++} ++ ++/* + * Check if MSR is intercepted for L01 MSR bitmap. + */ + static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +@@ -2978,6 +3002,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_IA32_TSC: + msr_info->data = guest_read_tsc(vcpu); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ msr_info->data = to_vmx(vcpu)->spec_ctrl; ++ break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has_arch_capabilities(vcpu)) +@@ -3083,6 +3114,36 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ /* The STIBP bit doesn't fault even if it's not advertised */ ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ return 1; ++ ++ vmx->spec_ctrl = data; ++ ++ if (!data) ++ break; ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_vmx_merge_msr_bitmap. We should not touch the ++ * vmcs02.msr_bitmap here since it gets completely overwritten ++ * in the merging. We update the vmcs01 here for L1 as well ++ * since it will end up touching the MSR anyway now. ++ */ ++ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, ++ MSR_IA32_SPEC_CTRL, ++ MSR_TYPE_RW); ++ break; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && + !guest_cpuid_has_ibpb(vcpu)) +@@ -5216,6 +5277,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) + u64 cr0; + + vmx->rmode.vm86_active = 0; ++ vmx->spec_ctrl = 0; + + vmx->soft_vnmi_blocked = 0; + +@@ -8806,6 +8868,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + + vmx_arm_hv_timer(vcpu); + ++ /* ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if ++ * it's non-zero. 
Since vmentry is serialising on affected CPUs, there ++ * is no need to worry about the conditional branch over the wrmsr ++ * being speculatively taken. ++ */ ++ if (vmx->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ + vmx->__launched = vmx->loaded_vmcs->launched; + asm( + /* Store host registers */ +@@ -8924,6 +8995,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + #endif + ); + ++ /* ++ * We do not use IBRS in the kernel. If this vCPU has used the ++ * SPEC_CTRL MSR it may have left it on; save the value and ++ * turn it off. This is much more efficient than blindly adding ++ * it to the atomic save/restore list. Especially as the former ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM. ++ * ++ * For non-nested case: ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ * ++ * For nested case: ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ */ ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ ++ if (vmx->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +@@ -9476,7 +9568,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + /* +- * pred_cmd is trying to verify two things: ++ * pred_cmd & spec_ctrl are trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap +@@ -9489,9 +9581,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + * the MSR. 
+ */ + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); ++ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && +- !pred_cmd) ++ !pred_cmd && !spec_ctrl) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); +@@ -9527,6 +9620,12 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + } + } + ++ if (spec_ctrl) ++ nested_vmx_disable_intercept_for_msr( ++ msr_bitmap_l1, msr_bitmap_l0, ++ MSR_IA32_SPEC_CTRL, ++ MSR_TYPE_R | MSR_TYPE_W); ++ + if (pred_cmd) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index d01742e..d2ea523 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -975,7 +975,7 @@ static u32 msrs_to_save[] = { + #endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, +- MSR_IA32_ARCH_CAPABILITIES ++ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES + }; + + static unsigned num_msrs_to_save; +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0023-KVM-SVM-Allow-direct-access-to-MSR_IA32_SPEC_CTRL.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0023-KVM-SVM-Allow-direct-access-to-MSR_IA32_SPEC_CTRL.patch new file mode 100644 index 00000000..905134c7 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0023-KVM-SVM-Allow-direct-access-to-MSR_IA32_SPEC_CTRL.patch @@ -0,0 +1,192 @@ +From c8b2b4bc3e5eddb48f6eda57e9138a2ea2d39345 Mon Sep 17 00:00:00 2001 +From: KarimAllah Ahmed <karahmed@amazon.de> +Date: Sat, 3 Feb 2018 15:56:23 +0100 +Subject: [PATCH 23/33] KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL + +(cherry picked from commit b2ac58f90540e39324e7a29a7ad471407ae0bf48) + +[ Based on a patch from Paolo Bonzini <pbonzini@redhat.com> ] + +... basically doing exactly what we do for VMX: + +- Passthrough SPEC_CTRL to guests (if enabled in guest CPUID) +- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest + actually used it. 
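As a self-contained sketch of that lazy save/restore pattern (a userspace simulation: the MSR accessors below are stubs standing in for rdmsrl/wrmsrl, and the struct is a hypothetical stand-in for the real vCPU state, not KVM's):

#include <stdint.h>

static uint64_t hw_spec_ctrl; /* stub standing in for the hardware MSR */

static void wrmsr_spec_ctrl(uint64_t v) { hw_spec_ctrl = v; }
static uint64_t rdmsr_spec_ctrl(void) { return hw_spec_ctrl; }

struct sketch_vcpu { uint64_t spec_ctrl; int msr_passed_through; };

static void sketch_vcpu_run(struct sketch_vcpu *v)
{
        /* Restore the guest value only if the guest ever wrote non-zero. */
        if (v->spec_ctrl)
                wrmsr_spec_ctrl(v->spec_ctrl);

        /* ... enter the guest here ... */

        /*
         * With the MSR passed through, the guest may have changed it
         * without exiting: read the value back, then clear it so the
         * host never runs with the guest's IBRS setting.
         */
        if (v->msr_passed_through)
                v->spec_ctrl = rdmsr_spec_ctrl();
        if (v->spec_ctrl)
                wrmsr_spec_ctrl(0);
}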
+ +Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Darren Kenny <darren.kenny@oracle.com> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: kvm@vger.kernel.org +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Ashok Raj <ashok.raj@intel.com> +Link: https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karahmed@amazon.de +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/svm.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 88 insertions(+) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 43e45b9..4a36977 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -183,6 +183,8 @@ struct vcpu_svm { + u64 gs_base; + } host; + ++ u64 spec_ctrl; ++ + u32 *msrpm; + + ulong nmi_iret_rip; +@@ -248,6 +250,7 @@ static const struct svm_direct_access_msrs { + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, + #endif ++ { .index = MSR_IA32_SPEC_CTRL, .always = false }, + { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, +@@ -863,6 +866,25 @@ static bool valid_msr_intercept(u32 index) + return false; + } + ++static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) ++{ ++ u8 bit_write; ++ unsigned long tmp; ++ u32 offset; ++ u32 *msrpm; ++ ++ msrpm = is_guest_mode(vcpu) ? 
to_svm(vcpu)->nested.msrpm: ++ to_svm(vcpu)->msrpm; ++ ++ offset = svm_msrpm_offset(msr); ++ bit_write = 2 * (msr & 0x0f) + 1; ++ tmp = msrpm[offset]; ++ ++ BUG_ON(offset == MSR_INVALID); ++ ++ return !!test_bit(bit_write, &tmp); ++} ++ + static void set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) + { +@@ -1534,6 +1556,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) + u32 dummy; + u32 eax = 1; + ++ svm->spec_ctrl = 0; ++ + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; +@@ -3515,6 +3539,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_VM_CR: + msr_info->data = svm->nested.vm_cr_msr; + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ msr_info->data = svm->spec_ctrl; ++ break; + case MSR_IA32_UCODE_REV: + msr_info->data = 0x01000065; + break; +@@ -3599,6 +3630,33 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr); + break; ++ case MSR_IA32_SPEC_CTRL: ++ if (!msr->host_initiated && ++ !guest_cpuid_has_ibrs(vcpu)) ++ return 1; ++ ++ /* The STIBP bit doesn't fault even if it's not advertised */ ++ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) ++ return 1; ++ ++ svm->spec_ctrl = data; ++ ++ if (!data) ++ break; ++ ++ /* ++ * For non-nested: ++ * When it's written (to non-zero) for the first time, pass ++ * it through. ++ * ++ * For nested: ++ * The handling of the MSR bitmap for L2 guests is done in ++ * nested_svm_vmrun_msrpm. ++ * We update the L1 MSR bit as well since it will end up ++ * touching the MSR anyway now. ++ */ ++ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); ++ break; + case MSR_IA32_PRED_CMD: + if (!msr->host_initiated && + !guest_cpuid_has_ibpb(vcpu)) +@@ -4842,6 +4900,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + + local_irq_enable(); + ++ /* ++ * If this vCPU has touched SPEC_CTRL, restore the guest's value if ++ * it's non-zero. Since vmentry is serialising on affected CPUs, there ++ * is no need to worry about the conditional branch over the wrmsr ++ * being speculatively taken. ++ */ ++ if (svm->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ + asm volatile ( + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" +@@ -4934,6 +5001,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + #endif + ); + ++ /* ++ * We do not use IBRS in the kernel. If this vCPU has used the ++ * SPEC_CTRL MSR it may have left it on; save the value and ++ * turn it off. This is much more efficient than blindly adding ++ * it to the atomic save/restore list. Especially as the former ++ * (Saving guest MSRs on vmexit) doesn't even exist in KVM. ++ * ++ * For non-nested case: ++ * If the L01 MSR bitmap does not intercept the MSR, then we need to ++ * save it. ++ * ++ * For nested case: ++ * If the L02 MSR bitmap does not intercept the MSR, then we need to ++ * save it. 
++ */ ++ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ ++ if (svm->spec_ctrl) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0024-KVM-nVMX-Fix-races-when-sending-nested-PI-while-dest.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0024-KVM-nVMX-Fix-races-when-sending-nested-PI-while-dest.patch new file mode 100644 index 00000000..8feed73a --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0024-KVM-nVMX-Fix-races-when-sending-nested-PI-while-dest.patch @@ -0,0 +1,100 @@ +From 36417bad8e288e64df1067207030c67304c26ee5 Mon Sep 17 00:00:00 2001 +From: Liran Alon <liran.alon@oracle.com> +Date: Thu, 9 Nov 2017 20:27:20 +0200 +Subject: [PATCH 24/33] KVM: nVMX: Fix races when sending nested PI while dest + enters/leaves L2 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 6b6977117f50d60455ace86b2d256f6fb4f3de05 upstream. + +Consider the following scenario: +1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI +to CPU B via virtual posted-interrupt mechanism. +2. CPU B is currently executing L2 guest. +3. vmx_deliver_nested_posted_interrupt() calls +kvm_vcpu_trigger_posted_interrupt() which will note that +vcpu->mode == IN_GUEST_MODE. +4. Assume that before CPU A sends the physical POSTED_INTR_NESTED_VECTOR +IPI, CPU B exits from L2 to L0 during event-delivery +(valid IDT-vectoring-info). +5. CPU A now sends the physical IPI. The IPI is received in host and +it's handler (smp_kvm_posted_intr_nested_ipi()) does nothing. +6. Assume that before CPU A sets pi_pending=true and KVM_REQ_EVENT, +CPU B continues to run in L0 and reach vcpu_enter_guest(). As +KVM_REQ_EVENT is not set yet, vcpu_enter_guest() will continue and resume +L2 guest. +7. At this point, CPU A sets pi_pending=true and KVM_REQ_EVENT but +it's too late! CPU B already entered L2 and KVM_REQ_EVENT will only be +consumed at next L2 entry! + +Another scenario to consider: +1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI +to CPU B via virtual posted-interrupt mechanism. +2. Assume that before CPU A calls kvm_vcpu_trigger_posted_interrupt(), +CPU B is at L0 and is about to resume into L2. Further assume that it is +in vcpu_enter_guest() after check for KVM_REQ_EVENT. +3. At this point, CPU A calls kvm_vcpu_trigger_posted_interrupt() which +will note that vcpu->mode != IN_GUEST_MODE. Therefore, do nothing and +return false. Then, will set pi_pending=true and KVM_REQ_EVENT. +4. Now CPU B continue and resumes into L2 guest without processing +the posted-interrupt until next L2 entry! + +To fix both issues, we just need to change +vmx_deliver_nested_posted_interrupt() to set pi_pending=true and +KVM_REQ_EVENT before calling kvm_vcpu_trigger_posted_interrupt(). + +It will fix the first scenario by chaging step (6) to note that +KVM_REQ_EVENT and pi_pending=true and therefore process +nested posted-interrupt. + +It will fix the second scenario by two possible ways: +1. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B has changed +vcpu->mode to IN_GUEST_MODE, physical IPI will be sent and will be received +when CPU resumes into L2. +2. 
If kvm_vcpu_trigger_posted_interrupt() is called while CPU B hasn't yet +changed vcpu->mode to IN_GUEST_MODE, then after CPU B will change +vcpu->mode it will call kvm_request_pending() which will return true and +therefore force another round of vcpu_enter_guest() which will note that +KVM_REQ_EVENT and pi_pending=true and therefore process nested +posted-interrupt. + +Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") +Signed-off-by: Liran Alon <liran.alon@oracle.com> +Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com> +Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com> +[Add kvm_vcpu_kick to also handle the case where L1 doesn't intercept L2 HLT + and L2 executes HLT instruction. - Paolo] +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index c564d03..85078c7 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -4944,14 +4944,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, + + if (is_guest_mode(vcpu) && + vector == vmx->nested.posted_intr_nv) { +- /* the PIR and ON have been set by L1. */ +- kvm_vcpu_trigger_posted_interrupt(vcpu); + /* + * If a posted intr is not recognized by hardware, + * we will accomplish it in the next vmentry. + */ + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); ++ /* the PIR and ON have been set by L1. */ ++ if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) ++ kvm_vcpu_kick(vcpu); + return 0; + } + return -1; +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0025-KVM-x86-Reduce-retpoline-performance-impact-in-slot_.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0025-KVM-x86-Reduce-retpoline-performance-impact-in-slot_.patch new file mode 100644 index 00000000..eb633c9c --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0025-KVM-x86-Reduce-retpoline-performance-impact-in-slot_.patch @@ -0,0 +1,103 @@ +From 15ca5afe3e56a0f80151aa4b6f06233b39736a2e Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Sat, 10 Feb 2018 23:39:24 +0000 +Subject: [PATCH 25/33] KVM/x86: Reduce retpoline performance impact in + slot_handle_level_range(), by always inlining iterator helper methods +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 928a4c39484281f8ca366f53a1db79330d058401 upstream. + +With retpoline, tight loops of "call this function for every XXX" are +very much pessimised by taking a prediction miss *every* time. This one +is by far the biggest contributor to the guest launch time with retpoline. + +By marking the iterator slot_handle_…() functions always_inline, we can +ensure that the indirect function call can be optimised away into a +direct call and it actually generates slightly smaller code because +some of the other conditionals can get optimised away too. + +Performance is now pretty close to what we see with nospectre_v2 on +the command line. 
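The mechanism is general: when an iterator taking a function pointer is force-inlined into a call site where that pointer is a compile-time constant, the compiler can replace the indirect call (a retpoline thunk under CONFIG_RETPOLINE) with a direct one. A standalone sketch under that assumption; walk_slots() and clear_dirty() are illustrative names, not the real slot_handle_*() helpers:

    #include <stdbool.h>

    typedef bool (*slot_level_handler)(int slot);

    /* Forcing the inline means every call site sees a constant 'fn',
     * so the call through the pointer compiles to a direct call. */
    static inline __attribute__((always_inline)) bool
    walk_slots(slot_level_handler fn, int nslots)
    {
        bool flush = false;
        for (int i = 0; i < nslots; i++)
            flush |= fn(i);   /* devirtualized once inlined */
        return flush;
    }

    static bool clear_dirty(int slot) { return (slot & 1) != 0; }

    bool flush_needed(void)
    {
        return walk_slots(clear_dirty, 64);
    }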
+ +Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> +Tested-by: Filippo Sironi <sironi@amazon.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Reviewed-by: Filippo Sironi <sironi@amazon.de> +Acked-by: Paolo Bonzini <pbonzini@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: arjan.van.de.ven@intel.com +Cc: dave.hansen@intel.com +Cc: jmattson@google.com +Cc: karahmed@amazon.de +Cc: kvm@vger.kernel.org +Cc: rkrcmar@redhat.com +Link: http://lkml.kernel.org/r/1518305967-31356-4-git-send-email-dwmw@amazon.co.uk +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/mmu.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index d9c7e98..ee4af7a 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -4636,7 +4636,7 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) + typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); + + /* The caller should hold mmu-lock before calling this function. */ +-static bool ++static __always_inline bool + slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) +@@ -4666,7 +4666,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + return flush; + } + +-static bool ++static __always_inline bool + slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +@@ -4677,7 +4677,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + lock_flush_tlb); + } + +-static bool ++static __always_inline bool + slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) + { +@@ -4685,7 +4685,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + } + +-static bool ++static __always_inline bool + slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) + { +@@ -4693,7 +4693,7 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + } + +-static bool ++static __always_inline bool + slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) + { +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0026-KVM-x86-fix-escape-of-guest-dr6-to-the-host.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0026-KVM-x86-fix-escape-of-guest-dr6-to-the-host.patch new file mode 100644 index 00000000..38255613 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0026-KVM-x86-fix-escape-of-guest-dr6-to-the-host.patch @@ -0,0 +1,70 @@ +From 75a724909e81cd4612490d633ab269495377d332 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li <wanpeng.li@hotmail.com> +Date: Wed, 13 Dec 2017 10:46:40 +0100 +Subject: [PATCH 26/33] KVM: x86: 
fix escape of guest dr6 to the host +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit efdab992813fb2ed825745625b83c05032e9cda2 upstream. + +syzkaller reported: + + WARNING: CPU: 0 PID: 12927 at arch/x86/kernel/traps.c:780 do_debug+0x222/0x250 + CPU: 0 PID: 12927 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #16 + RIP: 0010:do_debug+0x222/0x250 + Call Trace: + <#DB> + debug+0x3e/0x70 + RIP: 0010:copy_user_enhanced_fast_string+0x10/0x20 + </#DB> + _copy_from_user+0x5b/0x90 + SyS_timer_create+0x33/0x80 + entry_SYSCALL_64_fastpath+0x23/0x9a + +The testcase sets a watchpoint (with perf_event_open) on a buffer that is +passed to timer_create() as the struct sigevent argument. In timer_create(), +copy_from_user()'s rep movsb triggers the BP. The testcase also sets +the debug registers for the guest. + +However, KVM only restores host debug registers when the host has active +watchpoints, which triggers a race condition when running the testcase with +multiple threads. The guest's DR6.BS bit can escape to the host before +another thread invokes timer_create(), and do_debug() complains. + +The fix is to respect do_debug()'s dr6 invariant when leaving KVM. + +Reported-by: Dmitry Vyukov <dvyukov@google.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: David Hildenbrand <david@redhat.com> +Cc: Dmitry Vyukov <dvyukov@google.com> +Reviewed-by: David Hildenbrand <david@redhat.com> +Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/x86.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index d2ea523..af333e1 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -2833,6 +2833,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) + kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); + vcpu->arch.last_host_tsc = rdtsc(); ++ /* ++ * If userspace has set any breakpoints or watchpoints, dr6 is restored ++ * on every vmexit, but if not, we might have a stale dr6 from the ++ * guest. do_debug expects dr6 to be cleared after it runs, do the same. ++ */ ++ set_debugreg(0, 6); + } + + static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-x86-add-MULTIUSER-dependency-for-KVM.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-x86-add-MULTIUSER-dependency-for-KVM.patch new file mode 100644 index 00000000..ef01a1cb --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-x86-add-MULTIUSER-dependency-for-KVM.patch @@ -0,0 +1,37 @@ +From 216ac4ef7d2da59cd2b3d6e34e559c7ef49a143d Mon Sep 17 00:00:00 2001 +From: Arnd Bergmann <arnd@arndb.de> +Date: Wed, 19 Jul 2017 14:53:04 +0200 +Subject: [PATCH 27/33] x86: add MULTIUSER dependency for KVM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit c2ce3f5d89d57301e2756ac325fe2ebc33bfec30 upstream. 
+ +KVM tries to select 'TASKSTATS', which had additional dependencies: + +warning: (KVM) selects TASKSTATS which has unmet direct dependencies (NET && MULTIUSER) + +Signed-off-by: Arnd Bergmann <arnd@arndb.de> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig +index ab8e32f..66da97d 100644 +--- a/arch/x86/kvm/Kconfig ++++ b/arch/x86/kvm/Kconfig +@@ -22,7 +22,7 @@ config KVM + depends on HAVE_KVM + depends on HIGH_RES_TIMERS + # for TASKSTATS/TASK_DELAY_ACCT: +- depends on NET ++ depends on NET && MULTIUSER + select PREEMPT_NOTIFIERS + select MMU_NOTIFIER + select ANON_INODES +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0028-KVM-add-X86_LOCAL_APIC-dependency.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0028-KVM-add-X86_LOCAL_APIC-dependency.patch new file mode 100644 index 00000000..5c62ba8b --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0028-KVM-add-X86_LOCAL_APIC-dependency.patch @@ -0,0 +1,41 @@ +From 7e8b0d6af232b1d642960ca4fb026a70bfaf1206 Mon Sep 17 00:00:00 2001 +From: Arnd Bergmann <arnd@arndb.de> +Date: Wed, 4 Oct 2017 12:28:18 +0200 +Subject: [PATCH 28/33] KVM: add X86_LOCAL_APIC dependency +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit e42eef4ba38806b18c4a74f0c276fb2e0b548173 upstream. + +The rework of the posted interrupt handling broke building without +support for the local APIC: + +ERROR: "boot_cpu_physical_apicid" [arch/x86/kvm/kvm-intel.ko] undefined! + +That configuration is probably not particularly useful anyway, so +we can avoid the randconfig failures by adding a Kconfig dependency. + +Fixes: 8b306e2f3c41 ("KVM: VMX: avoid double list add with VT-d posted interrupts") +Signed-off-by: Arnd Bergmann <arnd@arndb.de> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig +index 66da97d..9150e09 100644 +--- a/arch/x86/kvm/Kconfig ++++ b/arch/x86/kvm/Kconfig +@@ -23,6 +23,7 @@ config KVM + depends on HIGH_RES_TIMERS + # for TASKSTATS/TASK_DELAY_ACCT: + depends on NET && MULTIUSER ++ depends on X86_LOCAL_APIC + select PREEMPT_NOTIFIERS + select MMU_NOTIFIER + select ANON_INODES +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0029-KVM-async_pf-Fix-DF-due-to-inject-Page-not-Present-a.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0029-KVM-async_pf-Fix-DF-due-to-inject-Page-not-Present-a.patch new file mode 100644 index 00000000..b50d5453 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0029-KVM-async_pf-Fix-DF-due-to-inject-Page-not-Present-a.patch @@ -0,0 +1,105 @@ +From 8e13680f134458dd1b0529ccb636ae5895fa8a4d Mon Sep 17 00:00:00 2001 +From: Wanpeng Li <wanpeng.li@hotmail.com> +Date: Thu, 14 Sep 2017 03:54:16 -0700 +Subject: [PATCH 29/33] KVM: async_pf: Fix #DF due to inject "Page not Present" + and "Page Ready" exceptions simultaneously +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 9a6e7c39810e4a8bc7fc95056cefb40583fe07ef upstream. + +qemu-system-x86-8600 [004] d..1 7205.687530: kvm_entry: vcpu 2 +qemu-system-x86-8600 [004] .... 
7205.687532: kvm_exit: reason EXCEPTION_NMI rip 0xffffffffa921297d info ffffeb2c0e44e018 80000b0e +qemu-system-x86-8600 [004] .... 7205.687532: kvm_page_fault: address ffffeb2c0e44e018 error_code 0 +qemu-system-x86-8600 [004] .... 7205.687620: kvm_try_async_get_page: gva = 0xffffeb2c0e44e018, gfn = 0x427e4e +qemu-system-x86-8600 [004] .N.. 7205.687628: kvm_async_pf_not_present: token 0x8b002 gva 0xffffeb2c0e44e018 + kworker/4:2-7814 [004] .... 7205.687655: kvm_async_pf_completed: gva 0xffffeb2c0e44e018 address 0x7fcc30c4e000 +qemu-system-x86-8600 [004] .... 7205.687703: kvm_async_pf_ready: token 0x8b002 gva 0xffffeb2c0e44e018 +qemu-system-x86-8600 [004] d..1 7205.687711: kvm_entry: vcpu 2 + +After running some memory intensive workload in guest, I catch the kworker +which completes the GUP too quickly, and queues an "Page Ready" #PF exception +after the "Page not Present" exception before the next vmentry as the above +trace which will result in #DF injected to guest. + +This patch fixes it by clearing the queue for "Page not Present" if "Page Ready" +occurs before the next vmentry since the GUP has already got the required page +and shadow page table has already been fixed by "Page Ready" handler. + +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> +Fixes: 7c90705bf2a3 ("KVM: Inject asynchronous page fault into a PV guest if page is swapped out.") +[Changed indentation and added clearing of injected. - Radim] +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +[port from upstream v4.14-rc1, Don't assign to kvm_queued_exception::injected or + x86_exception::async_page_fault] +Signed-off-by: Jack Wang <jinpu.wang@profitbricks.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/x86.c | 34 ++++++++++++++++++++++++++-------- + 1 file changed, 26 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index af333e1..9f0f7e2 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8370,6 +8370,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) + sizeof(val)); + } + ++static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) ++{ ++ ++ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, ++ sizeof(u32)); ++} ++ + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) + { +@@ -8396,6 +8403,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) + { + struct x86_exception fault; ++ u32 val; + + trace_kvm_async_pf_ready(work->arch.token, work->gva); + if (work->wakeup_all) +@@ -8403,14 +8411,24 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + else + kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + +- if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && +- !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { +- fault.vector = PF_VECTOR; +- fault.error_code_valid = true; +- fault.error_code = 0; +- fault.nested_page_fault = false; +- fault.address = work->arch.token; +- kvm_inject_page_fault(vcpu, &fault); ++ if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && ++ !apf_get_user(vcpu, &val)) { ++ if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && ++ vcpu->arch.exception.pending && ++ vcpu->arch.exception.nr == PF_VECTOR && ++ !apf_put_user(vcpu, 0)) { ++ vcpu->arch.exception.pending = false; ++ vcpu->arch.exception.nr = 0; ++ vcpu->arch.exception.has_error_code = false; ++ vcpu->arch.exception.error_code = 0; ++ } else if (!apf_put_user(vcpu, 
KVM_PV_REASON_PAGE_READY)) { ++ fault.vector = PF_VECTOR; ++ fault.error_code_valid = true; ++ fault.error_code = 0; ++ fault.nested_page_fault = false; ++ fault.address = work->arch.token; ++ kvm_inject_page_fault(vcpu, &fault); ++ } + } + vcpu->arch.apf.halted = false; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0030-KVM-VMX-clean-up-declaration-of-VPID-EPT-invalidatio.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0030-KVM-VMX-clean-up-declaration-of-VPID-EPT-invalidatio.patch new file mode 100644 index 00000000..fefa3aac --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0030-KVM-VMX-clean-up-declaration-of-VPID-EPT-invalidatio.patch @@ -0,0 +1,57 @@ +From 9d11f29130341345dee37007dd76b9c4e83956a9 Mon Sep 17 00:00:00 2001 +From: Jan Dakinevich <jan.dakinevich@gmail.com> +Date: Fri, 23 Feb 2018 11:42:17 +0100 +Subject: [PATCH 30/33] KVM: VMX: clean up declaration of VPID/EPT invalidation + types +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 63f3ac48133a19110c8a3666028dbd9b1bf3dcb3 upstream + +- Remove VMX_EPT_EXTENT_INDIVIDUAL_ADDR, since there is no such type of + EPT invalidation + + - Add missing VPID types names + +Signed-off-by: Jan Dakinevich <jan.dakinevich@gmail.com> +Tested-by: Ladi Prosek <lprosek@redhat.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +[jwang: port to 4.4] +Signed-off-by: Jack Wang <jinpu.wang@profitbricks.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/vmx.h | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h +index a002b07..6899cf1 100644 +--- a/arch/x86/include/asm/vmx.h ++++ b/arch/x86/include/asm/vmx.h +@@ -399,10 +399,11 @@ enum vmcs_field { + #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) + + #define VMX_NR_VPIDS (1 << 16) ++#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0 + #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 + #define VMX_VPID_EXTENT_ALL_CONTEXT 2 ++#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3 + +-#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 + #define VMX_EPT_EXTENT_CONTEXT 1 + #define VMX_EPT_EXTENT_GLOBAL 2 + #define VMX_EPT_EXTENT_SHIFT 24 +@@ -419,8 +420,10 @@ enum vmcs_field { + #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) + + #define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ ++#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */ + #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ + #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ ++#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ + + #define VMX_EPT_DEFAULT_GAW 3 + #define VMX_EPT_MAX_GAW 0x4 +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0031-KVM-nVMX-invvpid-handling-improvements.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0031-KVM-nVMX-invvpid-handling-improvements.patch new file mode 100644 index 00000000..e96f0d9b --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0031-KVM-nVMX-invvpid-handling-improvements.patch @@ -0,0 +1,102 @@ +From 1d5388c0b1e6eef66d7999451bb22cddf4cc5546 Mon Sep 17 00:00:00 2001 +From: Jan Dakinevich <jan.dakinevich@gmail.com> +Date: Fri, 23 Feb 2018 11:42:18 +0100 +Subject: [PATCH 31/33] KVM: nVMX: invvpid handling improvements +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit 
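Both this patch and 0030 above lean on the same encoding: VPID invalidation type N is advertised by capability bit N + 8 in the high word of the VPID/EPT capability MSR (bits 40..43 of the full register, matching the inline comments on the *_BIT defines). A small sketch of that check, with vpid_type_supported() as an illustrative helper name:

    #include <stdint.h>
    #include <stdbool.h>

    /* Type values 0..3 mirror VMX_VPID_EXTENT_*; type N is supported
     * when bit N + 8 is set in the high-word capability field. */
    static bool vpid_type_supported(uint32_t vpid_caps, unsigned int type)
    {
        return type < 4 && (vpid_caps & (1u << (type + 8)));
    }

For example, vpid_type_supported(caps, 1) answers whether single-context invalidation (VMX_VPID_EXTENT_SINGLE_CONTEXT) may be exposed to L1.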
+ +commit bcdde302b8268ef7dbc4ddbdaffb5b44eafe9a1e upstream + + - Expose all invalidation types to the L1 + + - Reject invvpid instruction, if L1 passed zero vpid value to single + context invalidations + +Signed-off-by: Jan Dakinevich <jan.dakinevich@gmail.com> +Tested-by: Ladi Prosek <lprosek@redhat.com> +Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> +[jwang: port to 4.4] +Signed-off-by: Jack Wang <jinpu.wang@profitbricks.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 36 ++++++++++++++++++++++++------------ + 1 file changed, 24 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 85078c7..f6c0568 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -142,6 +142,12 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); + + #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + ++#define VMX_VPID_EXTENT_SUPPORTED_MASK \ ++ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ ++ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ ++ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ ++ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) ++ + /* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive +@@ -2836,8 +2842,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) + */ + if (enable_vpid) + vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | +- VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | +- VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; ++ VMX_VPID_EXTENT_SUPPORTED_MASK; + else + vmx->nested.nested_vmx_vpid_caps = 0; + +@@ -7671,7 +7676,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + +- types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; ++ types = (vmx->nested.nested_vmx_vpid_caps & ++ VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; + + if (type >= 32 || !(types & (1 << type))) { + nested_vmx_failValid(vcpu, +@@ -7693,21 +7699,27 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) + } + + switch (type) { ++ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + case VMX_VPID_EXTENT_SINGLE_CONTEXT: +- /* +- * Old versions of KVM use the single-context version so we +- * have to support it; just treat it the same as all-context. 
+- */ ++ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: ++ if (!vpid) { ++ nested_vmx_failValid(vcpu, ++ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); ++ skip_emulated_instruction(vcpu); ++ return 1; ++ } ++ break; + case VMX_VPID_EXTENT_ALL_CONTEXT: +- __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); +- nested_vmx_succeed(vcpu); + break; + default: +- /* Trap individual address invalidation invvpid calls */ +- BUG_ON(1); +- break; ++ WARN_ON_ONCE(1); ++ skip_emulated_instruction(vcpu); ++ return 1; + } + ++ __vmx_flush_tlb(vcpu, vmx->nested.vpid02); ++ nested_vmx_succeed(vcpu); ++ + skip_emulated_instruction(vcpu); + return 1; + } +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0032-KVM-x86-Remove-indirect-MSR-op-calls-from-SPEC_CTRL.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0032-KVM-x86-Remove-indirect-MSR-op-calls-from-SPEC_CTRL.patch new file mode 100644 index 00000000..4f0b4222 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0032-KVM-x86-Remove-indirect-MSR-op-calls-from-SPEC_CTRL.patch @@ -0,0 +1,105 @@ +From 0ebeae5f6b25b48c0559950e2b7c2f0a1ffd641c Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini <pbonzini@redhat.com> +Date: Thu, 22 Feb 2018 16:43:17 +0100 +Subject: [PATCH 32/33] KVM/x86: Remove indirect MSR op calls from SPEC_CTRL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit ecb586bd29c99fb4de599dec388658e74388daad upstream. + +Having a paravirt indirect call in the IBRS restore path is not a +good idea, since we are trying to protect from speculative execution +of bogus indirect branch targets. It is also slower, so use +native_wrmsrl() on the vmentry path too. + +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Reviewed-by: Jim Mattson <jmattson@google.com> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Cc: KarimAllah Ahmed <karahmed@amazon.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: kvm@vger.kernel.org +Cc: stable@vger.kernel.org +Fixes: d28b387fb74da95d69d2615732f50cceb38e9a4d +Link: http://lkml.kernel.org/r/20180222154318.20361-2-pbonzini@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/svm.c | 7 ++++--- + arch/x86/kvm/vmx.c | 7 ++++--- + 2 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 4a36977..8d33396 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -44,6 +44,7 @@ + #include <asm/debugreg.h> + #include <asm/kvm_para.h> + #include <asm/irq_remapping.h> ++#include <asm/microcode.h> + #include <asm/nospec-branch.h> + + #include <asm/virtext.h> +@@ -4907,7 +4908,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + * being speculatively taken. + */ + if (svm->spec_ctrl) +- wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + + asm volatile ( + "push %%" _ASM_BP "; \n\t" +@@ -5017,10 +5018,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + * save it. 
+ */ + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) +- rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + + if (svm->spec_ctrl) +- wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index f6c0568..aa2684a 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -49,6 +49,7 @@ + #include <asm/kexec.h> + #include <asm/apic.h> + #include <asm/irq_remapping.h> ++#include <asm/microcode.h> + #include <asm/nospec-branch.h> + + #include "trace.h" +@@ -8888,7 +8889,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + * being speculatively taken. + */ + if (vmx->spec_ctrl) +- wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + + vmx->__launched = vmx->loaded_vmcs->launched; + asm( +@@ -9024,10 +9025,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + * save it. + */ + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) +- rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); ++ vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + + if (vmx->spec_ctrl) +- wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0033-KVM-VMX-Optimize-vmx_vcpu_run-and-svm_vcpu_run-by-ma.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0033-KVM-VMX-Optimize-vmx_vcpu_run-and-svm_vcpu_run-by-ma.patch new file mode 100644 index 00000000..95086730 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0033-KVM-VMX-Optimize-vmx_vcpu_run-and-svm_vcpu_run-by-ma.patch @@ -0,0 +1,65 @@ +From 885a241a441e144391884136534657f8502b2a48 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini <pbonzini@redhat.com> +Date: Thu, 22 Feb 2018 16:43:18 +0100 +Subject: [PATCH 33/33] KVM/VMX: Optimize vmx_vcpu_run() and svm_vcpu_run() by + marking the RDMSR path as unlikely() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 946fbbc13dce68902f64515b610eeb2a6c3d7a64 upstream. + +vmx_vcpu_run() and svm_vcpu_run() are large functions, and giving +branch hints to the compiler can actually make a substantial cycle +difference by keeping the fast path contiguous in memory. + +With this optimization, the retpoline-guest/retpoline-host case is +about 50 cycles faster. 
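unlikely() is a thin wrapper around __builtin_expect(), which tells the compiler which arm of a branch belongs on the straight-line path. A self-contained sketch, assuming stub helpers in place of the real interception check and MSR read:

    #include <stdbool.h>
    #include <stdint.h>

    #define unlikely(x) __builtin_expect(!!(x), 0)

    static bool     write_intercepted_stub(void) { return true; } /* common case */
    static uint64_t read_msr_stub(void)          { return 0; }    /* rare, slow  */

    uint64_t maybe_save_spec_ctrl(uint64_t cur)
    {
        /*
         * The RDMSR happens only when the guest was given direct MSR
         * access; hinting the branch as unlikely keeps the hot path
         * contiguous in the emitted code.
         */
        if (unlikely(!write_intercepted_stub()))
            cur = read_msr_stub();
        return cur;
    }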
+ +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Reviewed-by: Jim Mattson <jmattson@google.com> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Cc: KarimAllah Ahmed <karahmed@amazon.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: kvm@vger.kernel.org +Cc: stable@vger.kernel.org +Link: http://lkml.kernel.org/r/20180222154318.20361-3-pbonzini@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/svm.c | 2 +- + arch/x86/kvm/vmx.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 8d33396..b82bb66 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -5017,7 +5017,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ +- if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + + if (svm->spec_ctrl) +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index aa2684a..3c3558b 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -9024,7 +9024,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ +- if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) ++ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + + if (vmx->spec_ctrl) +-- +2.7.4 + diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/upstream-backports.scc b/common/recipes-kernel/linux/linux-yocto-4.9.21/upstream-backports.scc index 06d6de30..70237be9 100644 --- a/common/recipes-kernel/linux/linux-yocto-4.9.21/upstream-backports.scc +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/upstream-backports.scc @@ -167,3 +167,36 @@ patch 0011-x86-retpoline-Support-retpoline-builds-with-Clang.patch patch 0012-x86-speculation-objtool-Annotate-indirect-calls-jump.patch patch 0013-x86-boot-objtool-Annotate-indirect-jump-in-secondary.patch patch 0014-x86-speculation-Move-firmware_restrict_branch_specul.patch +patch 0001-KVM-Fix-stack-out-of-bounds-read-in-write_mmio.patch +patch 0002-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch +patch 0003-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch +patch 0004-KVM-x86-emulator-Return-to-user-mode-on-L1-CPL-0-emu.patch +patch 0005-KVM-x86-Don-t-re-execute-instruction-when-not-passin.patch +patch 0006-KVM-X86-Fix-operand-address-size-during-instruction-.patch +patch 0007-KVM-x86-ioapic-Fix-level-triggered-EOI-and-IOAPIC-re.patch +patch 0008-KVM-x86-ioapic-Clear-Remote-IRR-when-entry-is-switch.patch +patch 0009-KVM-x86-ioapic-Preserve-read-only-values-in-the-redi.patch +patch 0010-KVM-VMX-Fix-rflags-cache-during-vCPU-reset.patch +patch 0011-KVM-x86-Make-indirect-calls-in-emulator-speculation-.patch +patch 0012-KVM-VMX-Make-indirect-call-speculation-safe.patch +patch 0013-x86-kvm-Update-spectre-v1-mitigation.patch +patch 0014-KVM-nVMX-kmap-can-t-fail.patch +patch 0015-KVM-nVMX-vmx_complete_nested_posted_interrupt-can-t-.patch +patch 0016-KVM-nVMX-mark-vmcs12-pages-dirty-on-L2-exit.patch +patch 0017-KVM-nVMX-Eliminate-vmcs02-pool.patch +patch 0018-KVM-VMX-introduce-alloc_loaded_vmcs.patch +patch 
0019-KVM-VMX-make-MSR-bitmaps-per-VCPU.patch
+patch 0020-KVM-x86-Add-IBPB-support.patch
+patch 0021-KVM-VMX-Emulate-MSR_IA32_ARCH_CAPABILITIES.patch
+patch 0022-KVM-VMX-Allow-direct-access-to-MSR_IA32_SPEC_CTRL.patch
+patch 0023-KVM-SVM-Allow-direct-access-to-MSR_IA32_SPEC_CTRL.patch
+patch 0024-KVM-nVMX-Fix-races-when-sending-nested-PI-while-dest.patch
+patch 0025-KVM-x86-Reduce-retpoline-performance-impact-in-slot_.patch
+patch 0026-KVM-x86-fix-escape-of-guest-dr6-to-the-host.patch
+patch 0027-x86-add-MULTIUSER-dependency-for-KVM.patch
+patch 0028-KVM-add-X86_LOCAL_APIC-dependency.patch
+patch 0029-KVM-async_pf-Fix-DF-due-to-inject-Page-not-Present-a.patch
+patch 0030-KVM-VMX-clean-up-declaration-of-VPID-EPT-invalidatio.patch
+patch 0031-KVM-nVMX-invvpid-handling-improvements.patch
+patch 0032-KVM-x86-Remove-indirect-MSR-op-calls-from-SPEC_CTRL.patch
+patch 0033-KVM-VMX-Optimize-vmx_vcpu_run-and-svm_vcpu_run-by-ma.patch