aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c330
1 files changed, 211 insertions, 119 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cfd880ab841d..df6f6df902c6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -90,6 +90,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
+static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+
#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
@@ -295,13 +297,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
int err;
- if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ value = (value & mask) | (smsr->values[slot].host & ~mask);
+ if (value == smsr->values[slot].curr)
return 0;
- smsr->values[slot].curr = value;
err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
if (err)
return 1;
+ smsr->values[slot].curr = value;
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
@@ -437,6 +440,14 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
* for #DB exceptions under VMX.
*/
vcpu->arch.dr6 ^= payload & DR6_RTM;
+
+ /*
+ * The #DB payload is defined as compatible with the 'pending
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
+ * defined in the 'pending debug exceptions' field (enabled
+ * breakpoint), it is reserved and must be zero in DR6.
+ */
+ vcpu->arch.dr6 &= ~BIT(12);
break;
case PF_VECTOR:
vcpu->arch.cr2 = payload;
@@ -816,11 +827,25 @@ void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
vcpu->guest_xcr0_loaded = 1;
}
+
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
+ vcpu->arch.pkru != vcpu->arch.host_pkru)
+ __write_pkru(vcpu->arch.pkru);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+ vcpu->arch.pkru = rdpkru();
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
+ __write_pkru(vcpu->arch.host_pkru);
+ }
+
if (vcpu->guest_xcr0_loaded) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
@@ -880,9 +905,38 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
+static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
+{
+ u64 reserved_bits = CR4_RESERVED_BITS;
+
+ if (!cpu_has(c, X86_FEATURE_XSAVE))
+ reserved_bits |= X86_CR4_OSXSAVE;
+
+ if (!cpu_has(c, X86_FEATURE_SMEP))
+ reserved_bits |= X86_CR4_SMEP;
+
+ if (!cpu_has(c, X86_FEATURE_SMAP))
+ reserved_bits |= X86_CR4_SMAP;
+
+ if (!cpu_has(c, X86_FEATURE_FSGSBASE))
+ reserved_bits |= X86_CR4_FSGSBASE;
+
+ if (!cpu_has(c, X86_FEATURE_PKU))
+ reserved_bits |= X86_CR4_PKE;
+
+ if (!cpu_has(c, X86_FEATURE_LA57) &&
+ !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
+ reserved_bits |= X86_CR4_LA57;
+
+ if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
+ reserved_bits |= X86_CR4_UMIP;
+
+ return reserved_bits;
+}
+
static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- if (cr4 & CR4_RESERVED_BITS)
+ if (cr4 & cr4_reserved_bits)
return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
@@ -921,6 +975,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
@@ -1048,9 +1104,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- vcpu->arch.db[dr] = val;
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
@@ -1087,9 +1145,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[dr];
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
break;
case 4:
/* fall through */
@@ -1250,27 +1310,18 @@ u64 kvm_get_arch_capabilities(void)
data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
/*
- * On TAA affected systems, export MDS_NO=0 when:
- * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
- * - Updated microcode is present. This is detected by
- * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
- * that VERW clears CPU buffers.
- *
- * When MDS_NO=0 is exported, guests deploy clear CPU buffer
- * mitigation and don't complain:
- *
- * "Vulnerable: Clear CPU buffers attempted, no microcode"
- *
- * If TSX is disabled on the system, guests are also mitigated against
- * TAA and clear CPU buffer mitigation is not required for guests.
+ * On TAA affected systems:
+ * - nothing to do if TSX is disabled on the host.
+ * - we emulate TSX_CTRL if present on the host.
+ * This lets the guest use VERW to clear CPU buffers.
*/
- if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
- (data & ARCH_CAP_TSX_CTRL_MSR))
- data &= ~ARCH_CAP_MDS_NO;
+ if (!boot_cpu_has(X86_FEATURE_RTM))
+ data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
+ else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ data |= ARCH_CAP_TAA_NO;
return data;
}
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
{
@@ -2372,7 +2423,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
@@ -2467,43 +2521,47 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+ /* -EAGAIN is returned in atomic context so we can just return. */
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+ &map, &vcpu->arch.st.cache, false))
return;
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
- if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+ st->preempted & KVM_VCPU_FLUSH_TLB);
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb(vcpu, false);
- if (vcpu->arch.st.steal.version & 1)
- vcpu->arch.st.steal.version += 1; /* first time write, random junk */
+ vcpu->arch.st.preempted = 0;
- vcpu->arch.st.steal.version += 1;
+ if (st->version & 1)
+ st->version += 1; /* first time write, random junk */
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+ st->version += 1;
smp_wmb();
- vcpu->arch.st.steal.steal += current->sched_info.run_delay -
+ st->steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
-
smp_wmb();
- vcpu->arch.st.steal.version += 1;
+ st->version += 1;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -2570,7 +2628,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_write(vcpu, msr, data);
case MSR_IA32_TSCDEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
@@ -2648,11 +2706,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
- data & KVM_STEAL_VALID_BITS,
- sizeof(struct kvm_steal_time)))
- return 1;
-
vcpu->arch.st.msr_val = data;
if (!(data & KVM_MSR_ENABLED))
@@ -2792,7 +2845,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
data = vcpu->arch.mce_banks[offset];
break;
}
@@ -2869,7 +2925,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_APICBASE:
msr_info->data = kvm_get_apic_base(vcpu);
break;
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_TSCDEADLINE:
@@ -3305,9 +3361,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->vcpu_load(vcpu, cpu);
- fpregs_assert_state_consistent();
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_return();
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
/* Apply any externally detected TSC adjustments (due to suspend) */
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
@@ -3348,15 +3403,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+ if (vcpu->arch.st.preempted)
+ return;
- kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal.preempted,
- offsetof(struct kvm_steal_time, preempted),
- sizeof(vcpu->arch.st.steal.preempted));
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+ &vcpu->arch.st.cache, true))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -3492,7 +3557,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
unsigned bank_num = mcg_cap & 0xff, bank;
r = -EINVAL;
- if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
+ if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
goto out;
if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
@@ -4285,6 +4350,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_NESTED_STATE: {
struct kvm_nested_state __user *user_kvm_nested_state = argp;
struct kvm_nested_state kvm_state;
+ int idx;
r = -EINVAL;
if (!kvm_x86_ops->set_nested_state)
@@ -4308,7 +4374,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
&& !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
break;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_GET_SUPPORTED_HV_CPUID: {
@@ -6214,11 +6282,11 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
return r;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool write_fault_to_shadow_pgtable,
int emulation_type)
{
- gpa_t gpa = cr2;
+ gpa_t gpa = cr2_or_gpa;
kvm_pfn_t pfn;
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
@@ -6232,7 +6300,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
* Write permission should be allowed since only
* write access need to be emulated.
*/
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
/*
* If the mapping is invalid in guest, let cpu retry
@@ -6289,10 +6357,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
- unsigned long cr2, int emulation_type)
+ gpa_t cr2_or_gpa, int emulation_type)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
last_retry_eip = vcpu->arch.last_retry_eip;
last_retry_addr = vcpu->arch.last_retry_addr;
@@ -6321,14 +6389,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (x86_page_table_writing_insn(ctxt))
return false;
- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
return false;
vcpu->arch.last_retry_eip = ctxt->eip;
- vcpu->arch.last_retry_addr = cr2;
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
if (!vcpu->arch.mmu->direct_map)
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -6472,11 +6540,8 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- int emulation_type,
- void *insn,
- int insn_len)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len)
{
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -6519,8 +6584,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (r != EMULATION_OK) {
if (emulation_type & EMULTYPE_TRAP_UD)
return EMULATE_FAIL;
- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
- emulation_type))
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
+ write_fault_to_spt,
+ emulation_type))
return EMULATE_DONE;
if (ctxt->have_exception) {
/*
@@ -6549,7 +6615,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
- if (retry_instruction(ctxt, cr2, emulation_type))
+ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
return EMULATE_DONE;
/* this is needed for vmware backdoor interface to work since it
@@ -6561,7 +6627,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
restart:
/* Save the faulting GPA (cr2) in the address field */
- ctxt->exception.address = cr2;
+ ctxt->exception.address = cr2_or_gpa;
r = x86_emulate_insn(ctxt);
@@ -6569,7 +6635,7 @@ restart:
return EMULATE_DONE;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
emulation_type))
return EMULATE_DONE;
@@ -6606,7 +6672,7 @@ restart:
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE && ctxt->tf)
+ if (r == EMULATE_DONE && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
kvm_vcpu_do_singlestep(vcpu, &r);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -6968,33 +7034,25 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
.handle_intel_pt_intr = kvm_handle_intel_pt_intr,
};
+extern u8 __read_mostly shadow_phys_bits;
+
static void kvm_set_mmio_spte_mask(void)
{
u64 mask;
- int maxphyaddr = boot_cpu_data.x86_phys_bits;
-
- /*
- * Set the reserved bits and the present bit of an paging-structure
- * entry to generate page fault with PFER.RSV = 1.
- */
/*
- * Mask the uppermost physical address bit, which would be reserved as
- * long as the supported physical address width is less than 52.
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
*/
- mask = 1ull << 51;
-
- /* Set the present bit. */
- mask |= 1ull;
-
- /*
- * If reserved bit is not supported, clear the present bit to disable
- * mmio page fault.
- */
- if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52)
- mask &= ~1ull;
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
- kvm_mmu_set_mmio_spte_mask(mask, mask);
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}
#ifdef CONFIG_X86_64
@@ -7350,7 +7408,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
+static int inject_pending_event(struct kvm_vcpu *vcpu)
{
int r;
@@ -7386,7 +7444,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
* from L2 to L1.
*/
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ r = kvm_x86_ops->check_nested_events(vcpu);
if (r != 0)
return r;
}
@@ -7448,7 +7506,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
* KVM_REQ_EVENT only on certain events and not unconditionally?
*/
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ r = kvm_x86_ops->check_nested_events(vcpu);
if (r != 0)
return r;
}
@@ -7759,9 +7817,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
-int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end,
- bool blockable)
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
{
unsigned long apic_address;
@@ -7772,8 +7829,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (start <= apic_address && apic_address < end)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-
- return 0;
}
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
@@ -7921,7 +7976,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
- if (inject_pending_event(vcpu, req_int_win) != 0)
+ if (inject_pending_event(vcpu) != 0)
req_immediate_exit = true;
else {
/* Enable SMI/NMI/IRQ window open exits if needed.
@@ -8016,8 +8071,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
wait_lapic_expire(vcpu);
guest_enter_irqoff();
- /* The preempt notifier should have taken care of the FPU already. */
- WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD));
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -8137,7 +8193,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
- kvm_x86_ops->check_nested_events(vcpu, false);
+ kvm_x86_ops->check_nested_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
@@ -8274,12 +8330,26 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
+static void kvm_save_current_fpu(struct fpu *fpu)
+{
+ /*
+ * If the target FPU state is not resident in the CPU registers, just
+ * memcpy() from current, else save CPU state directly to the target.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ memcpy(&fpu->state, &current->thread.fpu.state,
+ fpu_kernel_xstate_size);
+ else
+ copy_fpregs_to_fpstate(fpu);
+}
+
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
+
/* PKRU is separately restored in kvm_x86_ops->run. */
__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
~XFEATURE_MASK_PKRU);
@@ -8295,8 +8365,9 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
+
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
fpregs_mark_activate();
fpregs_unlock();
@@ -8517,6 +8588,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
vcpu_load(vcpu);
+ if (kvm_mpx_supported())
+ kvm_load_guest_fpu(vcpu);
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
@@ -8525,6 +8598,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
else
mp_state->mp_state = vcpu->arch.mp_state;
+ if (kvm_mpx_supported())
+ kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
return 0;
}
@@ -8881,6 +8956,9 @@ static void fx_init(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
+
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
kvmclock_reset(vcpu);
@@ -8941,13 +9019,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- vcpu->arch.apf.msr_val = 0;
-
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-
- kvm_x86_ops->vcpu_free(vcpu);
+ kvm_arch_vcpu_free(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -9139,6 +9211,8 @@ int kvm_arch_hardware_setup(void)
if (r != 0)
return r;
+ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+
if (kvm_has_tsc_control) {
/*
* Make sure the user can only configure tsc_khz values that
@@ -9480,6 +9554,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
{
int i;
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by __kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
+
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
@@ -9542,11 +9623,18 @@ out_free:
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
+ struct kvm_vcpu *vcpu;
+ int i;
+
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
kvm_mmu_invalidate_mmio_sptes(kvm, gen);
+
+ /* Force re-initialization of steal_time cache */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -9554,6 +9642,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
+ if (change == KVM_MR_MOVE)
+ return kvm_arch_create_memslot(kvm, memslot,
+ mem->memory_size >> PAGE_SHIFT);
+
return 0;
}
@@ -9793,7 +9885,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
return;
- vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
+ vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true);
}
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
@@ -9876,7 +9968,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
{
struct x86_exception fault;
- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
@@ -9904,7 +9996,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
work->arch.token = ~0; /* broadcast wakeup */
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
!apf_get_user(vcpu, &val)) {