aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/vmx/nested.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
-rw-r--r--arch/x86/kvm/vmx/nested.c472
1 files changed, 242 insertions, 230 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f24a2c225070..f4b1ae439763 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -193,10 +193,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
if (!vmx->nested.hv_evmcs)
return;
- kunmap(vmx->nested.hv_evmcs_page);
- kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
vmx->nested.hv_evmcs_vmptr = -1ull;
- vmx->nested.hv_evmcs_page = NULL;
vmx->nested.hv_evmcs = NULL;
}
@@ -229,16 +227,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
- if (vmx->nested.virtual_apic_page) {
- kvm_release_page_dirty(vmx->nested.virtual_apic_page);
- vmx->nested.virtual_apic_page = NULL;
- }
- if (vmx->nested.pi_desc_page) {
- kunmap(vmx->nested.pi_desc_page);
- kvm_release_page_dirty(vmx->nested.pi_desc_page);
- vmx->nested.pi_desc_page = NULL;
- vmx->nested.pi_desc = NULL;
- }
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+ vmx->nested.pi_desc = NULL;
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
@@ -500,6 +491,17 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
}
}
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap[word] = ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+}
+
/*
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
@@ -508,88 +510,95 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
int msr;
- struct page *page;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
- /*
- * pred_cmd & spec_ctrl are trying to verify two things:
- *
- * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
- * ensures that we do not accidentally generate an L02 MSR bitmap
- * from the L12 MSR bitmap that is too permissive.
- * 2. That L1 or L2s have actually used the MSR. This avoids
- * unnecessarily merging of the bitmap if the MSR is unused. This
- * works properly because we only update the L01 MSR bitmap lazily.
- * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
- * updated to reflect this when L1 (or its L2s) actually write to
- * the MSR.
- */
- bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
- bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
+ struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return false;
- if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
- !pred_cmd && !spec_ctrl)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
- if (is_error_page(page))
- return false;
+ msr_bitmap_l1 = (unsigned long *)map->hva;
- msr_bitmap_l1 = (unsigned long *)kmap(page);
- if (nested_cpu_has_apic_reg_virt(vmcs12)) {
- /*
- * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it
- * just lets the processor take the value from the virtual-APIC page;
- * take those 256 bits directly from the L1 bitmap.
- */
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap_l0[word] = msr_bitmap_l1[word];
- msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
- }
- } else {
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap_l0[word] = ~0;
- msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
- }
- }
+ /*
+ * To keep the control flow simple, pay eight 8-byte writes (sixteen
+ * 4-byte writes on 32-bit systems) up front to enable intercepts for
+ * the x2APIC MSR range and selectively disable them below.
+ */
+ enable_x2apic_msr_intercepts(msr_bitmap_l0);
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_TASKPRI),
- MSR_TYPE_W);
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+ /*
+ * L0 need not intercept reads for MSRs between 0x800
+ * and 0x8ff, it just lets the processor take the value
+ * from the virtual-APIC page; take those 256 bits
+ * directly from the L1 bitmap.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap_l0[word] = msr_bitmap_l1[word];
+ }
+ }
- if (nested_cpu_has_vid(vmcs12)) {
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_EOI),
- MSR_TYPE_W);
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_SELF_IPI),
- MSR_TYPE_W);
+ X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (nested_cpu_has_vid(vmcs12)) {
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
+ }
}
- if (spec_ctrl)
+ /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
+ nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
+ MSR_FS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
+ MSR_GS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
+ MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+
+ /*
+ * Checking the L0->L1 bitmap is trying to verify two things:
+ *
+ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+ * ensures that we do not accidentally generate an L02 MSR bitmap
+ * from the L12 MSR bitmap that is too permissive.
+ * 2. That L1 or L2s have actually used the MSR. This avoids
+ * unnecessarily merging of the bitmap if the MSR is unused. This
+ * works properly because we only update the L01 MSR bitmap lazily.
+ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
+ * updated to reflect this when L1 (or its L2s) actually write to
+ * the MSR.
+ */
+ if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_R | MSR_TYPE_W);
- if (pred_cmd)
+ if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_PRED_CMD,
MSR_TYPE_W);
- kunmap(page);
- kvm_release_page_clean(page);
+ kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
return true;
}
@@ -597,20 +606,20 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ struct kvm_host_map map;
struct vmcs12 *shadow;
- struct page *page;
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
vmcs12->vmcs_link_pointer == -1ull)
return;
shadow = get_shadow_vmcs12(vcpu);
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
- memcpy(shadow, kmap(page), VMCS12_SIZE);
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+ return;
- kunmap(page);
- kvm_release_page_clean(page);
+ memcpy(shadow, map.hva, VMCS12_SIZE);
+ kvm_vcpu_unmap(vcpu, &map, false);
}
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
@@ -914,7 +923,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
if (!nested_cr3_valid(vcpu, cr3)) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
- return 1;
+ return -EINVAL;
}
/*
@@ -925,7 +934,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
!nested_ept) {
if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
*entry_failure_code = ENTRY_FAIL_PDPTE;
- return 1;
+ return -EINVAL;
}
}
}
@@ -1778,13 +1787,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
nested_release_evmcs(vcpu);
- vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
- vcpu, assist_page.current_nested_vmcs);
-
- if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
+ &vmx->nested.hv_evmcs_map))
return 0;
- vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+ vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
/*
* Currently, KVM only supports eVMCS version 1
@@ -2357,19 +2364,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
*/
if (vmx->emulation_required) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
- return 1;
+ return -EINVAL;
}
/* Shadow page tables on either EPT or shadow page tables. */
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
entry_failure_code))
- return 1;
+ return -EINVAL;
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
- kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+ kvm_rsp_write(vcpu, vmcs12->guest_rsp);
+ kvm_rip_write(vcpu, vmcs12->guest_rip);
return 0;
}
@@ -2573,11 +2580,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
return 0;
}
-/*
- * Checks related to Host Control Registers and MSRs
- */
-static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
+ nested_check_vm_exit_controls(vcpu, vmcs12) ||
+ nested_check_vm_entry_controls(vcpu, vmcs12))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
{
bool ia32e;
@@ -2585,6 +2600,15 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
!nested_cr3_valid(vcpu, vmcs12->host_cr3))
return -EINVAL;
+
+ if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
+ is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
+ return -EINVAL;
+
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
+ !kvm_pat_valid(vmcs12->host_ia32_pat))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2603,41 +2627,12 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
return 0;
}
-/*
- * Checks related to Guest Non-register State
- */
-static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
-{
- if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
- vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
- return -EINVAL;
-
- return 0;
-}
-
-static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
-{
- if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
- nested_check_vm_exit_controls(vcpu, vmcs12) ||
- nested_check_vm_entry_controls(vcpu, vmcs12))
- return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
-
- if (nested_check_host_control_regs(vcpu, vmcs12))
- return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
-
- if (nested_check_guest_non_reg_state(vmcs12))
- return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
-
- return 0;
-}
-
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- int r;
- struct page *page;
+ int r = 0;
struct vmcs12 *shadow;
+ struct kvm_host_map map;
if (vmcs12->vmcs_link_pointer == -1ull)
return 0;
@@ -2645,23 +2640,34 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
return -EINVAL;
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
- if (is_error_page(page))
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
return -EINVAL;
- r = 0;
- shadow = kmap(page);
+ shadow = map.hva;
+
if (shadow->hdr.revision_id != VMCS12_REVISION ||
shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
r = -EINVAL;
- kunmap(page);
- kvm_release_page_clean(page);
+
+ kvm_vcpu_unmap(vcpu, &map, false);
return r;
}
-static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12,
- u32 *exit_qual)
+/*
+ * Checks related to Guest Non-register State
+ */
+static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
+{
+ if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 *exit_qual)
{
bool ia32e;
@@ -2669,11 +2675,15 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
- return 1;
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+ !kvm_pat_valid(vmcs12->guest_ia32_pat))
+ return -EINVAL;
if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
- return 1;
+ return -EINVAL;
}
/*
@@ -2692,13 +2702,16 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
((vmcs12->guest_cr0 & X86_CR0_PG) &&
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
- return 1;
+ return -EINVAL;
}
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
- (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
- (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
- return 1;
+ (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
+ (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+ return -EINVAL;
+
+ if (nested_check_guest_non_reg_state(vmcs12))
+ return -EINVAL;
return 0;
}
@@ -2811,6 +2824,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_host_map *map;
struct page *page;
u64 hpa;
@@ -2843,53 +2857,43 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
- if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
- kvm_release_page_dirty(vmx->nested.virtual_apic_page);
- vmx->nested.virtual_apic_page = NULL;
- }
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
+ map = &vmx->nested.virtual_apic_map;
/*
* If translation failed, VM entry will fail because
* prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
- * Failing the vm entry is _not_ what the processor
- * does but it's basically the only possibility we
- * have. We could still enter the guest if CR8 load
- * exits are enabled, CR8 store exits are enabled, and
- * virtualize APIC access is disabled; in this case
- * the processor would never use the TPR shadow and we
- * could simply clear the bit from the execution
- * control. But such a configuration is useless, so
- * let's keep the code simple.
*/
- if (!is_error_page(page)) {
- vmx->nested.virtual_apic_page = page;
- hpa = page_to_phys(vmx->nested.virtual_apic_page);
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+ } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
+ nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ /*
+ * The processor will never use the TPR shadow, simply
+ * clear the bit from the execution control. Such a
+ * configuration is useless, but it happens in tests.
+ * For any other configuration, failing the vm entry is
+ * _not_ what the processor does but it's basically the
+ * only possibility we have.
+ */
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_TPR_SHADOW);
+ } else {
+ printk("bad virtual-APIC page address\n");
+ dump_vmcs();
}
}
if (nested_cpu_has_posted_intr(vmcs12)) {
- if (vmx->nested.pi_desc_page) { /* shouldn't happen */
- kunmap(vmx->nested.pi_desc_page);
- kvm_release_page_dirty(vmx->nested.pi_desc_page);
- vmx->nested.pi_desc_page = NULL;
- vmx->nested.pi_desc = NULL;
- vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
+ map = &vmx->nested.pi_desc_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+ vmx->nested.pi_desc =
+ (struct pi_desc *)(((void *)map->hva) +
+ offset_in_page(vmcs12->posted_intr_desc_addr));
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
}
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
- if (is_error_page(page))
- return;
- vmx->nested.pi_desc_page = page;
- vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
- vmx->nested.pi_desc =
- (struct pi_desc *)((void *)vmx->nested.pi_desc +
- (unsigned long)(vmcs12->posted_intr_desc_addr &
- (PAGE_SIZE - 1)));
- vmcs_write64(POSTED_INTR_DESC_ADDR,
- page_to_phys(vmx->nested.pi_desc_page) +
- (unsigned long)(vmcs12->posted_intr_desc_addr &
- (PAGE_SIZE - 1)));
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
@@ -2972,7 +2976,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
return -1;
}
- if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+ if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
goto vmentry_fail_vmexit;
}
@@ -3117,9 +3121,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
- ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12);
- if (ret)
- return nested_vmx_failValid(vcpu, ret);
+ if (nested_vmx_check_controls(vcpu, vmcs12))
+ return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ if (nested_vmx_check_host_state(vcpu, vmcs12))
+ return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
/*
* We're finally done with prerequisite checking, and can start with
@@ -3282,11 +3288,12 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
- vapic_page = kmap(vmx->nested.virtual_apic_page);
+ vapic_page = vmx->nested.virtual_apic_map.hva;
+ if (!vapic_page)
+ return;
+
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
- kunmap(vmx->nested.virtual_apic_page);
-
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -3397,8 +3404,8 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
- vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vmcs12->guest_rsp = kvm_rsp_read(vcpu);
+ vmcs12->guest_rip = kvm_rip_read(vcpu);
vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
@@ -3581,8 +3588,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
vmx_set_efer(vcpu, vcpu->arch.efer);
- kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+ kvm_rsp_write(vcpu, vmcs12->host_rsp);
+ kvm_rip_write(vcpu, vmcs12->host_rip);
vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
vmx_set_interrupt_shadow(vcpu, 0);
@@ -3768,8 +3775,18 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
nested_ept_uninit_mmu_context(vcpu);
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ /*
+ * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3
+ * points to shadow pages! Fortunately we only get here after a WARN_ON
+ * if EPT is disabled, so a VMabort is perfectly fine.
+ */
+ if (enable_ept) {
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ } else {
+ nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED);
+ }
/*
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -3917,16 +3934,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
- if (vmx->nested.virtual_apic_page) {
- kvm_release_page_dirty(vmx->nested.virtual_apic_page);
- vmx->nested.virtual_apic_page = NULL;
- }
- if (vmx->nested.pi_desc_page) {
- kunmap(vmx->nested.pi_desc_page);
- kvm_release_page_dirty(vmx->nested.pi_desc_page);
- vmx->nested.pi_desc_page = NULL;
- vmx->nested.pi_desc = NULL;
- }
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+ vmx->nested.pi_desc = NULL;
/*
* We are now running in L2, mmu_notifier will force to reload the
@@ -4222,7 +4232,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
{
int ret;
gpa_t vmptr;
- struct page *page;
+ uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -4268,20 +4278,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
* Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
* which replaces physical address width with 32
*/
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
- return nested_vmx_failInvalid(vcpu);
-
- page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
- if (is_error_page(page))
+ if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failInvalid(vcpu);
- if (*(u32 *)kmap(page) != VMCS12_REVISION) {
- kunmap(page);
- kvm_release_page_clean(page);
+ if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+ revision != VMCS12_REVISION)
return nested_vmx_failInvalid(vcpu);
- }
- kunmap(page);
- kvm_release_page_clean(page);
vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
@@ -4339,7 +4341,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+ if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_INVALID_ADDRESS);
@@ -4347,7 +4349,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
- if (vmx->nested.hv_evmcs_page) {
+ if (vmx->nested.hv_evmcs_map.hva) {
if (vmptr == vmx->nested.hv_evmcs_vmptr)
nested_release_evmcs(vcpu);
} else {
@@ -4546,7 +4548,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+ if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INVALID_ADDRESS);
@@ -4559,11 +4561,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return 1;
if (vmx->nested.current_vmptr != vmptr) {
+ struct kvm_host_map map;
struct vmcs12 *new_vmcs12;
- struct page *page;
- page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
- if (is_error_page(page)) {
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
@@ -4573,12 +4574,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
- new_vmcs12 = kmap(page);
+
+ new_vmcs12 = map.hva;
+
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
(new_vmcs12->hdr.shadow_vmcs &&
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
- kunmap(page);
- kvm_release_page_clean(page);
+ kvm_vcpu_unmap(vcpu, &map, false);
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -4590,8 +4592,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
* cached.
*/
memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
- kunmap(page);
- kvm_release_page_clean(page);
+ kvm_vcpu_unmap(vcpu, &map, false);
set_current_vmptr(vmx, vmptr);
}
@@ -4766,7 +4767,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u32 index = kvm_rcx_read(vcpu);
u64 address;
bool accessed_dirty;
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -4812,7 +4813,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12;
- u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+ u32 function = kvm_rax_read(vcpu);
/*
* VMFUNC is only supported for nested guests, but we always enable the
@@ -4898,7 +4899,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12, u32 exit_reason)
{
- u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u32 msr_index = kvm_rcx_read(vcpu);
gpa_t bitmap;
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
@@ -5335,9 +5336,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->format != 0)
return -EINVAL;
- if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
- nested_enable_evmcs(vcpu, NULL);
-
if (!nested_vmx_allowed(vcpu))
return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
@@ -5379,6 +5377,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->vmx.vmxon_pa == -1ull)
return 0;
+ if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
+ nested_enable_evmcs(vcpu, NULL);
+
vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
ret = enter_vmx_operation(vcpu);
if (ret)
@@ -5422,9 +5423,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
return 0;
- vmx->nested.nested_run_pending =
- !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
-
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != -1ull) {
struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
@@ -5442,14 +5440,20 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
}
- if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) ||
- nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+ if (nested_vmx_check_controls(vcpu, vmcs12) ||
+ nested_vmx_check_host_state(vcpu, vmcs12) ||
+ nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
return -EINVAL;
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
ret = nested_vmx_enter_non_root_mode(vcpu, false);
- if (ret)
+ if (ret) {
+ vmx->nested.nested_run_pending = 0;
return -EINVAL;
+ }
return 0;
}
@@ -5717,6 +5721,14 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
int i;
+ /*
+ * Without EPT it is not possible to restore L1's CR3 and PDPTR on
+ * VMfail, because they are not available in vmcs01. Just always
+ * use hardware checks.
+ */
+ if (!enable_ept)
+ nested_early_check = 1;
+
if (!cpu_has_vmx_shadow_vmcs())
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs) {