Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
 arch/x86/kvm/vmx/vmx.c | 209
 1 file changed, 127 insertions(+), 82 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 559634b59d2a..46ba2e03a892 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -781,7 +781,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
- if (enable_ept)
+ if (!vmx_need_pf_intercept(vcpu))
eb &= ~(1u << PF_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a
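vmx_need_pf_intercept() is a helper added elsewhere in this series (in vmx.h). A minimal sketch of its logic, assuming the series' goal of trapping #PF only when EPT is off or the guest's MAXPHYADDR is smaller than the host's; the exact upstream form may differ:

    static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
    {
            /* Without EPT every guest #PF is intercepted anyway. */
            if (!enable_ept)
                    return true;

            /*
             * With EPT, #PF only needs trapping to emulate faults on GPAs
             * above the guest's MAXPHYADDR when it is below the host's.
             */
            return cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
    }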
@@ -1816,7 +1816,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
msr->data = vmx_get_perf_capabilities();
return 0;
default:
- return 1;
+ return KVM_MSR_RET_INVALID;
}
}
@@ -2063,7 +2063,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
- if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+ if (kvm_spec_ctrl_test_value(data))
return 1;
vmx->spec_ctrl = data;
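kvm_spec_ctrl_test_value() lives in common x86 code and replaces the software list of valid IA32_SPEC_CTRL bits with a hardware probe. A hedged sketch of the approach; details may differ from the actual x86.c implementation:

    bool kvm_spec_ctrl_test_value(u64 value)
    {
            u64 saved_value;
            unsigned long flags;
            bool faulted = false;

            /*
             * With interrupts off, try to write the guest value to the real
             * MSR and let the CPU decide whether it is valid, then restore
             * the previous contents.
             */
            local_irq_save(flags);

            if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
                    faulted = true;
            else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
                    faulted = true;
            else
                    wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);

            local_irq_restore(flags);

            return faulted; /* non-zero means "reject the value" */
    }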
@@ -2934,14 +2934,16 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
- u64 root_hpa = vcpu->arch.mmu->root_hpa;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 root_hpa = mmu->root_hpa;
/* No flush required if the current context is invalid. */
if (!VALID_PAGE(root_hpa))
return;
if (enable_ept)
- ept_sync_context(construct_eptp(vcpu, root_hpa));
+ ept_sync_context(construct_eptp(vcpu, root_hpa,
+ mmu->shadow_root_level));
else if (!is_guest_mode(vcpu))
vpid_sync_context(to_vmx(vcpu)->vpid);
else
@@ -3064,26 +3066,19 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}
-static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
+static int vmx_get_max_tdp_level(void)
{
- if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ if (cpu_has_vmx_ept_5levels())
return 5;
return 4;
}
-static int get_ept_level(struct kvm_vcpu *vcpu)
-{
- if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
- return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
-
- return vmx_get_tdp_level(vcpu);
-}
-
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
+u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
+ int root_level)
{
u64 eptp = VMX_EPTP_MT_WB;
- eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+ eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
@@ -3093,7 +3088,8 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
return eptp;
}
-void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
+ int pgd_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3101,7 +3097,7 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, pgd);
+ eptp = construct_eptp(vcpu, pgd, pgd_level);
vmcs_write64(EPT_POINTER, eptp);
if (kvm_x86_ops.tlb_remote_flush) {
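For reference, a worked example of the value construct_eptp() now builds from its explicit root_level argument. Bit positions follow the SDM EPTP layout (bits 2:0 memory type, bits 5:3 page-walk length minus one, bit 6 A/D enable); the concrete numbers are illustrative:

    /*
     * 4-level walk, write-back memory type, A/D bits on, root at 0x123456000:
     *
     *   VMX_EPTP_MT_WB         = 0x06        (bits 2:0)
     *   VMX_EPTP_PWL_4         = 0x18        (bits 5:3, walk length 4)
     *   VMX_EPTP_AD_ENABLE_BIT = 0x40        (bit 6)
     *   root_hpa               = 0x123456000
     *
     *   eptp                   = 0x12345605e
     */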
@@ -4356,6 +4352,16 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+ /*
+ * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+ * between guest and host. In that case we only care about present
+ * faults.
+ */
+ if (enable_ept) {
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
+ }
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
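The two vmcs_write32() calls above rely on the PFEC_MASK/PFEC_MATCH filtering of #PF intercepts. A paraphrase of the hardware rule from the SDM, and what this particular mask/match pair achieves:

    /*
     * A guest #PF with error code PFEC causes a VM exit iff
     *
     *   ((PFEC & PFEC_MASK) == PFEC_MATCH) ==
     *                          !!(exception_bitmap & (1u << PF_VECTOR))
     *
     * i.e. on a match the exception-bitmap bit applies directly, on a
     * mismatch its meaning is inverted.  With MASK == MATCH ==
     * PFERR_PRESENT_MASK and the PF bit set by vmx_need_pf_intercept(),
     * only faults on present translations are intercepted; not-present
     * faults cannot be affected by the MAXPHYADDR mismatch and are
     * delivered straight to the guest.
     */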
@@ -4782,18 +4788,25 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.ndata = 4;
vcpu->run->internal.data[0] = vect_info;
vcpu->run->internal.data[1] = intr_info;
vcpu->run->internal.data[2] = error_code;
+ vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
return 0;
}
if (is_page_fault(intr_info)) {
cr2 = vmx_get_exit_qual(vcpu);
- /* EPT won't cause page fault directly */
- WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+ /*
+ * EPT will cause page fault only if we need to
+ * detect illegal GPAs.
+ */
+ kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+ return 1;
+ } else
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
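kvm_fixup_and_inject_pf_error() is added to common x86 code elsewhere in this series. The sketch below shows only the idea; the real helper re-walks the guest page tables and differs in detail, and guest_gva_maps_illegal_gpa() is a hypothetical placeholder:

    static void fixup_and_inject_pf_sketch(struct kvm_vcpu *vcpu, gva_t gva,
                                           u16 error_code)
    {
            struct x86_exception fault = {
                    .vector           = PF_VECTOR,
                    .error_code_valid = true,
                    .error_code       = error_code,
                    .address          = gva,
            };

            /*
             * If the guest's own page tables map this GVA to a GPA above
             * the guest's MAXPHYADDR, bare metal would have raised a
             * reserved-bit page fault, so report that back to the guest.
             */
            if (guest_gva_maps_illegal_gpa(vcpu, gva, error_code)) /* hypothetical */
                    fault.error_code |= PFERR_RSVD_MASK;

            kvm_inject_page_fault(vcpu, &fault);
    }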
@@ -5309,6 +5322,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
vcpu->arch.exit_qualification = exit_qualification;
+
+ /*
+ * Check that the GPA doesn't exceed physical memory limits, as that is
+ * a guest page fault. We have to emulate the instruction here, because
+ * if the illegal address is that of a paging structure, then
+ * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
+ * would also use advanced VM-exit information for EPT violations to
+ * reconstruct the page fault error code.
+ */
+ if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+ return kvm_emulate_instruction(vcpu, 0);
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
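kvm_mmu_is_illegal_gpa() is another helper from this series (in mmu.h). A minimal sketch, assuming it simply compares against the guest's CPUID-reported MAXPHYADDR:

    static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
    {
            /* Any set address bit at or above the guest's MAXPHYADDR is illegal. */
            return gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu));
    }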
@@ -6005,6 +6030,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -6013,6 +6039,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -6039,6 +6066,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->internal.data[3] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
+ vcpu->run->internal.data[vcpu->run->internal.ndata++] =
+ vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -6094,8 +6123,9 @@ unexpected_vmexit:
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
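last_vmentry_cpu is a new kvm_vcpu_arch field so these error paths can tell userspace which physical CPU the last VM entry ran on. A hedged sketch of where it would be recorded; the actual assignment lives in the common x86 entry path:

    /* in vcpu_enter_guest(), shortly before entering the guest: */
    vcpu->arch.last_vmentry_cpu = vcpu->cpu;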
@@ -6109,7 +6139,7 @@ unexpected_vmexit:
* information but as all relevant affected CPUs have 32KiB L1D cache size
* there is no point in doing so.
*/
-static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;
@@ -6142,7 +6172,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
vcpu->stat.l1d_flush++;
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
return;
}
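vmx_l1d_flush() is tagged noinstr because it is now called from the non-instrumentable guest enter/exit path; code in that section avoids the regular (potentially paravirtualized) MSR wrappers in favour of the native_* accessors and must bracket any instrumentable call explicitly. A generic sketch of the pattern, not actual KVM code:

    static noinstr void example_noinstr_helper(void)
    {
            /* native_* accessor: bypasses the paravirt wrapper. */
            native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);

            /*
             * Anything that may legitimately be traced or kprobed has to
             * sit inside an explicit instrumentation_begin()/end() window,
             * otherwise objtool flags the call.
             */
            instrumentation_begin();
            pr_warn_once("instrumentable only inside this window\n");
            instrumentation_end();
    }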
@@ -6628,7 +6658,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
}
}
-void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
vmx->loaded_vmcs->host_state.rsp = host_rsp;
@@ -6650,6 +6680,63 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
+static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ struct vcpu_vmx *vmx)
+{
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+
+ if (vcpu->arch.cr2 != native_read_cr2())
+ native_write_cr2(vcpu->arch.cr2);
+
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
+
+ vcpu->arch.cr2 = native_read_cr2();
+
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * guest_exit_irqoff() restores host context and reinstates RCU if
+ * enabled and required.
+ *
+ * This needs to be done before the below as native_read_msr()
+ * contains a tracepoint and x86_spec_ctrl_restore_host() calls
+ * into world and some more.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ guest_exit_irqoff();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
+
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
fastpath_t exit_fastpath;
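The trace/lockdep choreography in vmx_vcpu_enter_exit() mirrors the generic user-mode transition helpers that its comments reference. A rough side-by-side, paraphrased rather than copied from kernel/entry/common.c:

    /*
     * exit_to_user_mode()      ~  before VM entry:
     *      trace_hardirqs_on_prepare();
     *      lockdep_hardirqs_on_prepare(CALLER_ADDR0);
     *      user_enter_irqoff();       <->  guest_enter_irqoff();
     *      lockdep_hardirqs_on(CALLER_ADDR0);
     *
     * enter_from_user_mode()   ~  after VM exit:
     *      lockdep_hardirqs_off(CALLER_ADDR0);
     *      user_exit_irqoff();        <->  guest_exit_irqoff();
     *      trace_hardirqs_off_finish();
     */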
@@ -6724,19 +6811,8 @@ reenter_guest:
*/
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
- /* L1D Flush includes CPU buffer clear to mitigate MDS */
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mds_user_clear))
- mds_clear_cpu_buffers();
-
- if (vcpu->arch.cr2 != read_cr2())
- write_cr2(vcpu->arch.cr2);
-
- vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- vmx->loaded_vmcs->launched);
-
- vcpu->arch.cr2 = read_cr2();
+ /* The actual VMENTER/EXIT is in the .noinstr.text section. */
+ vmx_vcpu_enter_exit(vcpu, vmx);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -7229,7 +7305,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7478,42 +7554,6 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
kvm_flush_pml_buffers(kvm);
}
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- struct vmcs12 *vmcs12;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t dst;
-
- if (is_guest_mode(vcpu)) {
- WARN_ON_ONCE(vmx->nested.pml_full);
-
- /*
- * Check if PML is enabled for the nested guest.
- * Whether eptp bit 6 is set is already checked
- * as part of A/D emulation.
- */
- vmcs12 = get_vmcs12(vcpu);
- if (!nested_cpu_has_pml(vmcs12))
- return 0;
-
- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
- vmx->nested.pml_full = true;
- return 1;
- }
-
- gpa &= ~0xFFFull;
- dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
-
- if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
- offset_in_page(dst), sizeof(gpa)))
- return 0;
-
- vmcs12->guest_pml_index--;
- }
-
- return 0;
-}
-
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *memslot,
gfn_t offset, unsigned long mask)
@@ -7858,7 +7898,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_bp_intercept = update_exception_bitmap,
+ .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
@@ -7918,12 +7958,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.set_tss_addr = vmx_set_tss_addr,
.set_identity_map_addr = vmx_set_identity_map_addr,
- .get_tdp_level = vmx_get_tdp_level,
.get_mt_mask = vmx_get_mt_mask,
.get_exit_info = vmx_get_exit_info,
- .cpuid_update = vmx_cpuid_update,
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
@@ -7942,7 +7981,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
- .write_log_dirty = vmx_write_pml_buffer,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
@@ -8070,7 +8108,7 @@ static __init int hardware_setup(void)
ept_lpage_level = PG_LEVEL_2M;
else
ept_lpage_level = PG_LEVEL_4K;
- kvm_configure_mmu(enable_ept, ept_lpage_level);
+ kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
/*
* Only enable PML when hardware supports PML feature, and both EPT
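vmx_get_max_tdp_level() now reports only what the hardware supports; the per-vCPU choice that vmx_get_tdp_level() used to make (falling back to a 4-level EPT walk when guest MAXPHYADDR is at most 48) moves into the common MMU behind kvm_configure_mmu(). A hedged sketch of that common-code side; naming may not match mmu.c exactly:

    static int max_tdp_level;   /* stored by kvm_configure_mmu() */

    static int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
    {
            /* A 5-level walk only pays off if the guest can address more than 48 bits. */
            if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
                    return 4;

            return max_tdp_level;
    }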
@@ -8265,6 +8303,13 @@ static int __init vmx_init(void)
#endif
vmx_check_vmcs12_offsets();
+ /*
+ * Intel processors don't have problems with
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
+ * it for VMX by default
+ */
+ allow_smaller_maxphyaddr = true;
+
return 0;
}
module_init(vmx_init);
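allow_smaller_maxphyaddr is a new flag in common x86 code; with it set, userspace may configure a guest MAXPHYADDR below the host's and KVM advertises the corresponding capability. A hedged sketch of how the flag would surface, assuming the KVM_CAP_SMALLER_MAXPHYADDR name from this series:

    /* inside kvm_vm_ioctl_check_extension()'s switch (sketch): */
    case KVM_CAP_SMALLER_MAXPHYADDR:
            r = (int) allow_smaller_maxphyaddr;
            break;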