aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/vmx/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r--arch/x86/kvm/vmx/vmx.c193
1 files changed, 117 insertions, 76 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4bd1bf6214ee..c93070829790 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -31,6 +31,7 @@
#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
@@ -358,9 +359,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
if (!vmx->disable_fb_clear)
return;
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
msr |= FB_CLEAR_DIS;
- wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
/* Cache the MSR value to avoid reading it later */
vmx->msr_ia32_mcu_opt_ctrl = msr;
}
@@ -371,7 +372,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
return;
vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
- wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
@@ -862,6 +863,24 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
return true;
}
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
+{
+ unsigned int flags = 0;
+
+ if (vmx->loaded_vmcs->launched)
+ flags |= VMX_RUN_VMRESUME;
+
+ /*
+ * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
+ * to change it directly without causing a vmexit. In that case read
+ * it after vmexit and store it in vmx->spec_ctrl.
+ */
+ if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
+ flags |= VMX_RUN_SAVE_SPEC_CTRL;
+
+ return flags;
+}
+
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
{
@@ -1378,8 +1397,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
/*
* No indirect branch prediction barrier needed when switching
- * the active VMCS within a guest, e.g. on nested VM-Enter.
- * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+ * the active VMCS within a vCPU, unless IBRS is advertised to
+ * the vCPU. To minimize the number of IBPBs executed, KVM
+ * performs IBPB on nested VM-Exit (a single nested transition
+ * may switch the active VMCS multiple times).
*/
if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
indirect_branch_prediction_barrier();
@@ -1657,7 +1678,17 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
kvm_deliver_exception_payload(vcpu);
if (has_error_code) {
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ /*
+ * Despite the error code being architecturally defined as 32
+ * bits, and the VMCS field being 32 bits, Intel CPUs and thus
+ * VMX don't actually supporting setting bits 31:16. Hardware
+ * will (should) never provide a bogus error code, but AMD CPUs
+ * do generate error codes with bits 31:16 set, and so KVM's
+ * ABI lets userspace shove in arbitrary 32-bit values. Drop
+ * the upper bits to avoid VM-Fail, losing information that
+ * does't really exist is preferable to killing the VM.
+ */
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
@@ -1751,7 +1782,7 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
return vcpu->arch.tsc_offset;
@@ -1769,7 +1800,7 @@ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
* to the newly set TSC to get L2's TSC.
*/
if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
g_tsc_offset = vmcs12->tsc_offset;
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -2396,7 +2427,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_USE_TSC_OFFSETTING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
@@ -3212,18 +3243,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
@@ -4429,7 +4457,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -4440,7 +4468,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
return;
}
- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4773,7 +4801,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
return 0;
}
-static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.irq_exits;
return 1;
@@ -5045,21 +5073,6 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
vmcs_writel(GUEST_DR7, val);
}
-static int handle_cpuid(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_cpuid(vcpu);
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_rdmsr(vcpu);
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_wrmsr(vcpu);
-}
-
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
kvm_apic_update_ppr(vcpu);
@@ -5068,7 +5081,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5076,11 +5089,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_halt(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_halt(vcpu);
-}
-
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
return kvm_emulate_hypercall(vcpu);
@@ -5286,7 +5294,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
WARN_ON_ONCE(!enable_vnmi);
- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5307,7 +5315,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
intr_window_requested = exec_controls_get(vmx) &
- CPU_BASED_VIRTUAL_INTR_PENDING;
+ CPU_BASED_INTR_WINDOW_EXITING;
while (vmx->emulation_required && count-- != 0) {
if (intr_window_requested && vmx_interrupt_allowed(vcpu))
@@ -5628,11 +5636,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
- [EXIT_REASON_CPUID] = handle_cpuid,
- [EXIT_REASON_MSR_READ] = handle_rdmsr,
- [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
- [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
- [EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
+ [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
+ [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = kvm_emulate_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_RDPMC] = handle_rdpmc,
@@ -6006,9 +6014,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason])
+ && kvm_vmx_exit_handlers[exit_reason]) {
+#ifdef CONFIG_RETPOLINE
+ if (exit_reason == EXIT_REASON_MSR_WRITE)
+ return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+ return handle_preemption_timer(vcpu);
+ else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
+ return handle_interrupt_window(vcpu);
+ else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ return handle_external_interrupt(vcpu);
+ else if (exit_reason == EXIT_REASON_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+ return handle_ept_misconfig(vcpu);
+#endif
return kvm_vmx_exit_handlers[exit_reason](vcpu);
- else {
+ } else {
vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
exit_reason);
dump_vmcs();
@@ -6336,6 +6358,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
);
kvm_after_interrupt(vcpu);
+ vcpu->arch.at_instruction_boundary = true;
}
STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
@@ -6539,7 +6562,30 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
}
}
-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
+void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ unsigned int flags)
+{
+ u64 hostval = this_cpu_read(x86_spec_ctrl_current);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
+ return;
+
+ if (flags & VMX_RUN_SAVE_SPEC_CTRL)
+ vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
+
+ /*
+ * If the guest/host SPEC_CTRL values differ, restore the host value.
+ *
+ * For legacy IBRS, the IBRS bit always needs to be written after
+ * transitioning from a less privileged predictor mode, regardless of
+ * whether the guest/host values differ.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
+ vmx->spec_ctrl != hostval)
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
+
+ barrier_nospec();
+}
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -6628,32 +6674,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
write_cr2(vcpu->arch.cr2);
vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- vmx->loaded_vmcs->launched);
+ __vmx_vcpu_run_flags(vmx));
vcpu->arch.cr2 = read_cr2();
vmx_enable_fb_clear(vmx);
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
- vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
-
- x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
-
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs))
current_evmcs->hv_clean_fields |=
@@ -7241,6 +7267,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
break;
+ case x86_intercept_pause:
+ /*
+ * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+ * with vanilla NOPs in the emulator. Apply the interception
+ * check only to actual PAUSE instructions. Don't check
+ * PAUSE-loop-exiting, software can't expect a given PAUSE to
+ * exit, i.e. KVM is within its rights to allow L2 to execute
+ * the PAUSE.
+ */
+ if ((info->rep_prefix != REPE_PREFIX) ||
+ !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
+ return X86EMUL_CONTINUE;
+
+ break;
+
/* TODO: check more intercepts... */
default:
break;