diff options
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 193 |
1 files changed, 117 insertions, 76 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4bd1bf6214ee..c93070829790 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -31,6 +31,7 @@ #include <asm/apic.h> #include <asm/asm.h> #include <asm/cpu.h> +#include <asm/cpu_device_id.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/fpu/internal.h> @@ -358,9 +359,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -371,7 +372,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) @@ -862,6 +863,24 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) return true; } +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) +{ + unsigned int flags = 0; + + if (vmx->loaded_vmcs->launched) + flags |= VMX_RUN_VMRESUME; + + /* + * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free + * to change it directly without causing a vmexit. In that case read + * it after vmexit and store it in vmx->spec_ctrl. + */ + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) + flags |= VMX_RUN_SAVE_SPEC_CTRL; + + return flags; +} + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { @@ -1378,8 +1397,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, /* * No indirect branch prediction barrier needed when switching - * the active VMCS within a guest, e.g. on nested VM-Enter. - * The L1 VMM can protect itself with retpolines, IBPB or IBRS. + * the active VMCS within a vCPU, unless IBRS is advertised to + * the vCPU. To minimize the number of IBPBs executed, KVM + * performs IBPB on nested VM-Exit (a single nested transition + * may switch the active VMCS multiple times). */ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) indirect_branch_prediction_barrier(); @@ -1657,7 +1678,17 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu); if (has_error_code) { - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + /* + * Despite the error code being architecturally defined as 32 + * bits, and the VMCS field being 32 bits, Intel CPUs and thus + * VMX don't actually supporting setting bits 31:16. Hardware + * will (should) never provide a bogus error code, but AMD CPUs + * do generate error codes with bits 31:16 set, and so KVM's + * ABI lets userspace shove in arbitrary 32-bit values. Drop + * the upper bits to avoid VM-Fail, losing information that + * does't really exist is preferable to killing the VM. + */ + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; } @@ -1751,7 +1782,7 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) return vcpu->arch.tsc_offset - vmcs12->tsc_offset; return vcpu->arch.tsc_offset; @@ -1769,7 +1800,7 @@ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * to the newly set TSC to get L2's TSC. */ if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) g_tsc_offset = vmcs12->tsc_offset; trace_kvm_write_tsc_offset(vcpu->vcpu_id, @@ -2396,7 +2427,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | @@ -3212,18 +3243,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; - if (var->unusable || !var->present) - ar = 1 << 16; - else { - ar = var->type & 15; - ar |= (var->s & 1) << 4; - ar |= (var->dpl & 3) << 5; - ar |= (var->present & 1) << 7; - ar |= (var->avl & 1) << 12; - ar |= (var->l & 1) << 13; - ar |= (var->db & 1) << 14; - ar |= (var->g & 1) << 15; - } + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + ar |= (var->unusable || !var->present) << 16; return ar; } @@ -4429,7 +4457,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static void enable_irq_window(struct kvm_vcpu *vcpu) { - exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -4440,7 +4468,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; } - exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4773,7 +4801,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return 0; } -static int handle_external_interrupt(struct kvm_vcpu *vcpu) +static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; @@ -5045,21 +5073,6 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) vmcs_writel(GUEST_DR7, val); } -static int handle_cpuid(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_cpuid(vcpu); -} - -static int handle_rdmsr(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_rdmsr(vcpu); -} - -static int handle_wrmsr(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wrmsr(vcpu); -} - static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { kvm_apic_update_ppr(vcpu); @@ -5068,7 +5081,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5076,11 +5089,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) return 1; } -static int handle_halt(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_halt(vcpu); -} - static int handle_vmcall(struct kvm_vcpu *vcpu) { return kvm_emulate_hypercall(vcpu); @@ -5286,7 +5294,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!enable_vnmi); - exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5307,7 +5315,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); intr_window_requested = exec_controls_get(vmx) & - CPU_BASED_VIRTUAL_INTR_PENDING; + CPU_BASED_INTR_WINDOW_EXITING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) @@ -5628,11 +5636,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, - [EXIT_REASON_CPUID] = handle_cpuid, - [EXIT_REASON_MSR_READ] = handle_rdmsr, - [EXIT_REASON_MSR_WRITE] = handle_wrmsr, - [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, - [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_CPUID] = kvm_emulate_cpuid, + [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, + [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, + [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, + [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = handle_rdpmc, @@ -6006,9 +6014,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) + && kvm_vmx_exit_handlers[exit_reason]) { +#ifdef CONFIG_RETPOLINE + if (exit_reason == EXIT_REASON_MSR_WRITE) + return kvm_emulate_wrmsr(vcpu); + else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + return handle_preemption_timer(vcpu); + else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) + return handle_interrupt_window(vcpu); + else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + return handle_external_interrupt(vcpu); + else if (exit_reason == EXIT_REASON_HLT) + return kvm_emulate_halt(vcpu); + else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + return handle_ept_misconfig(vcpu); +#endif return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { + } else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); dump_vmcs(); @@ -6336,6 +6358,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) ); kvm_after_interrupt(vcpu); + vcpu->arch.at_instruction_boundary = true; } STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); @@ -6539,7 +6562,30 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); +void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + unsigned int flags) +{ + u64 hostval = this_cpu_read(x86_spec_ctrl_current); + + if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) + return; + + if (flags & VMX_RUN_SAVE_SPEC_CTRL) + vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + + /* + * If the guest/host SPEC_CTRL values differ, restore the host value. + * + * For legacy IBRS, the IBRS bit always needs to be written after + * transitioning from a less privileged predictor mode, regardless of + * whether the guest/host values differ. + */ + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || + vmx->spec_ctrl != hostval) + native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + + barrier_nospec(); +} static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { @@ -6628,32 +6674,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) write_cr2(vcpu->arch.cr2); vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - vmx->loaded_vmcs->launched); + __vmx_vcpu_run_flags(vmx)); vcpu->arch.cr2 = read_cr2(); vmx_enable_fb_clear(vmx); - /* - * We do not use IBRS in the kernel. If this vCPU has used the - * SPEC_CTRL MSR it may have left it on; save the value and - * turn it off. This is much more efficient than blindly adding - * it to the atomic save/restore list. Especially as the former - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. - * - * For non-nested case: - * If the L01 MSR bitmap does not intercept the MSR, then we need to - * save it. - * - * For nested case: - * If the L02 MSR bitmap does not intercept the MSR, then we need to - * save it. - */ - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); - /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) current_evmcs->hv_clean_fields |= @@ -7241,6 +7267,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ break; + case x86_intercept_pause: + /* + * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides + * with vanilla NOPs in the emulator. Apply the interception + * check only to actual PAUSE instructions. Don't check + * PAUSE-loop-exiting, software can't expect a given PAUSE to + * exit, i.e. KVM is within its rights to allow L2 to execute + * the PAUSE. + */ + if ((info->rep_prefix != REPE_PREFIX) || + !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + return X86EMUL_CONTINUE; + + break; + /* TODO: check more intercepts... */ default: break; |